.= 'tag.htm'; break; case 'flag': $pre .= $default_pre .= 'flag.htm'; break; case 'my': $pre .= $default_pre .= 'my.htm'; break; case 'my_password': $pre .= $default_pre .= 'my_password.htm'; break; case 'my_bind': $pre .= $default_pre .= 'my_bind.htm'; break; case 'my_avatar': $pre .= $default_pre .= 'my_avatar.htm'; break; case 'home_article': $pre .= $default_pre .= 'home_article.htm'; break; case 'home_comment': $pre .= $default_pre .= 'home_comment.htm'; break; case 'user': $pre .= $default_pre .= 'user.htm'; break; case 'user_login': $pre .= $default_pre .= 'user_login.htm'; break; case 'user_create': $pre .= $default_pre .= 'user_create.htm'; break; case 'user_resetpw': $pre .= $default_pre .= 'user_resetpw.htm'; break; case 'user_resetpw_complete': $pre .= $default_pre .= 'user_resetpw_complete.htm'; break; case 'user_comment': $pre .= $default_pre .= 'user_comment.htm'; break; case 'single_page': $pre .= $default_pre .= 'single_page.htm'; break; case 'search': $pre .= $default_pre .= 'search.htm'; break; case 'operate_sticky': $pre .= $default_pre .= 'operate_sticky.htm'; break; case 'operate_close': $pre .= $default_pre .= 'operate_close.htm'; break; case 'operate_delete': $pre .= $default_pre .= 'operate_delete.htm'; break; case 'operate_move': $pre .= $default_pre .= 'operate_move.htm'; break; case '404': $pre .= $default_pre .= '404.htm'; break; case 'read_404': $pre .= $default_pre .= 'read_404.htm'; break; case 'list_404': $pre .= $default_pre .= 'list_404.htm'; break; default: $pre .= $default_pre .= theme_mode_pre(); break; } if ($config['theme']) { $conffile = APP_PATH . 'view/template/' . $config['theme'] . '/conf.json'; $json = is_file($conffile) ? xn_json_decode(file_get_contents($conffile)) : array(); } !empty($json['installed']) and $path_file = APP_PATH . 'view/template/' . $config['theme'] . '/htm/' . ($id ? $id . '_' : '') . $pre; (empty($path_file) || !is_file($path_file)) and $path_file = APP_PATH . 'view/template/' . $config['theme'] . 
'/htm/' . $pre; if (!empty($config['theme_child']) && is_array($config['theme_child'])) { foreach ($config['theme_child'] as $theme) { if (empty($theme) || is_array($theme)) continue; $path_file = APP_PATH . 'view/template/' . $theme . '/htm/' . ($id ? $id . '_' : '') . $pre; !is_file($path_file) and $path_file = APP_PATH . 'view/template/' . $theme . '/htm/' . $pre; } } !is_file($path_file) and $path_file = APP_PATH . ($dir ? 'plugin/' . $dir . '/view/htm/' : 'view/htm/') . $default_pre; return $path_file; } /* theme_mode_pre($type = 0): picks the template file name for the site mode read from the global $config['setting']['website_mode']. Mode 1 yields 'portal.htm', mode 2 yields 'flat.htm', any other value yields 'index.htm'; when $type equals 2 the corresponding 'portal_category.htm', 'flat_category.htm' or 'index_category.htm' is returned instead. All comparisons are loose (==). Returns the file name as a string. */ function theme_mode_pre($type = 0) { global $config; $mode = $config['setting']['website_mode']; $pre = ''; if (1 == $mode) { $pre .= 2 == $type ? 'portal_category.htm' : 'portal.htm'; } elseif (2 == $mode) { $pre .= 2 == $type ? 'flat_category.htm' : 'flat.htm'; } else { $pre .= 2 == $type ? 'index_category.htm' : 'index.htm'; } return $pre; } ?>python - How to enforce string type when reading CSVs in DuckDB? - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - How to enforce string type when reading CSVs in DuckDB? - Stack Overflow

programmeradmin1浏览0评论

I'm trying to read a CSV in DuckDB (Python) but one of the records has a string ID while the rest of the file has the ID field as an integer.

I'd like to force that column to be read as a string. How can I do that?

I'm trying to read a CSV in DuckDB (Python) but one of the records has a string ID while the rest of the file has the ID field as an integer.

I'd like to force that column to be read as a string. How can I do that?

Share Improve this question edited Jan 30 at 18:25 jarlh 44.8k8 gold badges50 silver badges67 bronze badges asked Jan 30 at 16:30 VzzarrVzzarr 5,7204 gold badges58 silver badges100 bronze badges 2
  • 1) In a CSV all the record values will be strings and read as such. What you are referring to is DuckDB using AutoDetection to infer data types from the string values and then casting the strings as those types. 2) Is the one record ID value not compatible with casting to integer? – Adrian Klaver Commented Jan 30 at 18:01
  • @AdrianKlaver 1) correct, that's yet another alternative and more correct way of stating things - I'm new to DuckDB so still familiarising with these concepts. But again feel free to post in a new answer. 2) In some of the intermediate systems I'm working with they mixed strings and integers :( so I had to go string as more generic – Vzzarr Commented Feb 2 at 13:41
Add a comment  | 

2 Answers 2

Reset to default 3

Using stations-2023-09.csv from here Netherlands Stations:

SELECT * from read_csv('~/Downloads/stations-2023-09.csv') limit 5;

┌───────┬─────────┬─────────┬────────────┬──────────────────┬───┬──────────────────────┬─────────┬──────────────────────┬─────────────────┬─────────────────┐
│  id   │  code   │   uic   │ name_short │   name_medium    │ … │         slug         │ country │         type         │     geo_lat     │     geo_lng     │
│ int64 │ varchar │  int64  │  varchar   │     varchar      │   │       varchar        │ varchar │       varchar        │     double      │     double      │
├───────┼─────────┼─────────┼────────────┼──────────────────┼───┼──────────────────────┼─────────┼──────────────────────┼─────────────────┼─────────────────┤
│   266 │ HT      │ 8400319 │ Den Bosch  │ 's-Hertogenbosch │ … │ s-hertogenbosch      │ NL      │ knooppuntIntercity…  │        51.69048 │         5.29362 │
│   269 │ HTO     │ 8400320 │ Dn Bosch O │ 's-Hertogenb. O. │ … │ s-hertogenbosch-oost │ NL      │ stoptreinstation     │ 51.700553894043 │ 5.3183331489563 │
│   227 │ HDE     │ 8400388 │ 't Harde   │ 't Harde         │ … │ t-harde              │ NL      │ stoptreinstation     │      52.4091682 │        5.893611 │
│     8 │ AHBF    │ 8015345 │ Aachen     │ Aachen Hbf       │ … │ aachen-hbf           │ D       │ knooppuntIntercity…  │         50.7678 │        6.091499 │
│   818 │ AW      │ 8015199 │ Aachen W   │ Aachen West      │ … │ aachen-west          │ D       │ stoptreinstation     │        50.78036 │        6.070715 │
├───────┴─────────┴─────────┴────────────┴──────────────────┴───┴──────────────────────┴─────────┴──────────────────────┴─────────────────┴─────────────────┤
│ 5 rows                                                                                                                              11 columns (10 shown) │
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

SELECT * from read_csv('~/Downloads/stations-2023-09.csv', types={'id': 'VARCHAR'}) limit 5;

┌─────────┬─────────┬─────────┬────────────┬───┬──────────────────────┬─────────┬──────────────────────┬─────────────────┬─────────────────┐
│   id    │  code   │   uic   │ name_short │ … │         slug         │ country │         type         │     geo_lat     │     geo_lng     │
│ varchar │ varchar │  int64  │  varchar   │   │       varchar        │ varchar │       varchar        │     double      │     double      │
├─────────┼─────────┼─────────┼────────────┼───┼──────────────────────┼─────────┼──────────────────────┼─────────────────┼─────────────────┤
│ 266     │ HT      │ 8400319 │ Den Bosch  │ … │ s-hertogenbosch      │ NL      │ knooppuntIntercity…  │        51.69048 │         5.29362 │
│ 269     │ HTO     │ 8400320 │ Dn Bosch O │ … │ s-hertogenbosch-oost │ NL      │ stoptreinstation     │ 51.700553894043 │ 5.3183331489563 │
│ 227     │ HDE     │ 8400388 │ 't Harde   │ … │ t-harde              │ NL      │ stoptreinstation     │      52.4091682 │        5.893611 │
│ 8       │ AHBF    │ 8015345 │ Aachen     │ … │ aachen-hbf           │ D       │ knooppuntIntercity…  │         50.7678 │        6.091499 │
│ 818     │ AW      │ 8015199 │ Aachen W   │ … │ aachen-west          │ D       │ stoptreinstation     │        50.78036 │        6.070715 │
├─────────┴─────────┴─────────┴────────────┴───┴──────────────────────┴─────────┴──────────────────────┴─────────────────┴─────────────────┤
│ 5 rows                                                                                                              11 columns (9 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘


In the second case, types={'id': 'VARCHAR'} was used to override the autodetection and cast the values in the id column to varchar instead of int64. The other columns were left alone.

UPDATE

The above works in the DuckDB CLI, but not in the Python API, where the types parameter is named dtype instead. For the Python case, the working example is:

import duckdb

duckdb.read_csv('/home/aklaver/Downloads/stations-2023-09.csv', dtype={"id": "VARCHAR"}).limit(5)

┌─────────┬─────────┬─────────┬────────────┬───┬──────────────────────┬─────────┬──────────────────────┬─────────────────┬─────────────────┐
│   id    │  code   │   uic   │ name_short │ … │         slug         │ country │         type         │     geo_lat     │     geo_lng     │
│ varchar │ varchar │  int64  │  varchar   │   │       varchar        │ varchar │       varchar        │     double      │     double      │
├─────────┼─────────┼─────────┼────────────┼───┼──────────────────────┼─────────┼──────────────────────┼─────────────────┼─────────────────┤
│ 266     │ HT      │ 8400319 │ Den Bosch  │ … │ s-hertogenbosch      │ NL      │ knooppuntIntercity…  │        51.69048 │         5.29362 │
│ 269     │ HTO     │ 8400320 │ Dn Bosch O │ … │ s-hertogenbosch-oost │ NL      │ stoptreinstation     │ 51.700553894043 │ 5.3183331489563 │
│ 227     │ HDE     │ 8400388 │ 't Harde   │ … │ t-harde              │ NL      │ stoptreinstation     │      52.4091682 │        5.893611 │
│ 8       │ AHBF    │ 8015345 │ Aachen     │ … │ aachen-hbf           │ D       │ knooppuntIntercity…  │         50.7678 │        6.091499 │
│ 818     │ AW      │ 8015199 │ Aachen W   │ … │ aachen-west          │ D       │ stoptreinstation     │        50.78036 │        6.070715 │
├─────────┴─────────┴─────────┴────────────┴───┴──────────────────────┴─────────┴──────────────────────┴─────────────────┴─────────────────┤
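│ 5 rows                                                                                                              11 columns (9 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘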

The DuckDB function read_csv() has a parameter all_varchar that, when set to True, skips type inference and reads every field as a string (VARCHAR):

import duckdb


q = duckdb.read_csv(all_varchar=True, path_or_buffer=f'/my_path/my_csv.csv')

sql = duckdb.sql("""
    SELECT * 
    FROM q
""")

sql.show(max_width=100, max_rows=1000)

This allowed me to overcome the issue of the inconsistent type of the ID field I was trying to read. For reference: https://duckdb.org/docs/data/csv/overview.html#parameters

发布评论

评论列表(0)

  1. 暂无评论