云计算百科
云计算领域专业知识百科平台

不同DuckDB插件对不同工具生成的xlsx文件读取速度的比较

上文提到的SharedString格式对读取性能的影响如何,下面用实验来说明。
1.生成两种格式文件
DuckDB的excel插件生成不带SharedString格式的xlsx文件,如下所示

load excel
copy (from read_csv('NYC_311_SR_2010-2020-sample-1M.csv'))to 'ducknyc.xlsx' WITH (FORMAT xlsx, HEADER true);
Run Time (s): real 62.449 user 59.112000 sys 0.852000

writexlsx库生成带SharedString格式的xlsx文件,我让DeepSeek分别用c和python编写了把csv文件转换为xlsx的程序,如下所示:

#include "xlsxwriter.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE_LENGTH 1024
#define MAX_FIELD_LENGTH 256

int main(int argc, char *argv[]) {
if (argc != 3) {
printf("Usage: %s <input.csv> <output.xlsx>\\n", argv[0]);
return 1;
}

const char *input_csv = argv[1];
const char *output_xlsx = argv[2];

// 打开 CSV 文件
FILE *csv_file = fopen(input_csv, "r");
if (!csv_file) {
printf("Error: Could not open CSV file '%s'\\n", input_csv);
return 1;
}

// 创建工作簿和工作表
lxw_workbook *workbook = workbook_new(output_xlsx);
lxw_worksheet *worksheet = workbook_add_worksheet(workbook, NULL);

char line[MAX_LINE_LENGTH];
int row = 0;

// 逐行读取 CSV 文件
while (fgets(line, sizeof(line), csv_file)) {
// 移除换行符
line[strcspn(line, "\\n")] = '\\0';

char *token;
int col = 0;

// 使用逗号分隔字段
token = strtok(line, ",");
while (token != NULL) {
// 尝试将字段转换为数字
char *endptr;
double number = strtod(token, &endptr);

if (*endptr == '\\0') {
// 如果是数字,写入数字
worksheet_write_number(worksheet, row, col, number, NULL);
} else {
// 否则写入字符串
worksheet_write_string(worksheet, row, col, token, NULL);
}

token = strtok(NULL, ",");
col++;
}

row++;
}

// 关闭文件
fclose(csv_file);

// 保存并关闭工作簿
lxw_error error = workbook_close(workbook);
if (error) {
printf("Error: Failed to create XLSX file: %s\\n", lxw_strerror(error));
return 1;
}

printf("Successfully converted '%s' to '%s'\\n", input_csv, output_xlsx);
return 0;
}

编译执行如下

git clone –depth=1 https://github.com/jmcnamara/libxlsxwriter
cd libxlsxwriter
make
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/par/libxlsxwriter/lib
gcc c2xx.c -o c2xx -lxlsxwriter -I /par/libxlsxwriter/include -L /par/libxlsxwriter/lib -O3

time ./c2xx nyc1m.csv cnyc.xlsx
Successfully converted 'nyc1m.csv' to 'cnyc.xlsx'

real0m29.910s
user0m28.432s
sys0m1.420s

ls -la cnyc.xlsx
-rw-r–r– 1 root root 188461713 Aug 15 09:10 cnyc.xlsx

c程序没有考虑转义字符的问题,转成的xlsx文件有点问题,但稀奇的是,sheetreader插件居然能识别。

import csv
import xlsxwriter
import sys

def csv_to_xlsx(input_csv, output_xlsx):
"""
将 CSV 文件转换为 XLSX 文件

参数:
input_csv (str): 输入的 CSV 文件路径
output_xlsx (str): 输出的 Excel 文件路径
"""
try:
# 创建工作簿和工作表
workbook = xlsxwriter.Workbook(output_xlsx)
worksheet = workbook.add_worksheet()

# 设置格式 – 可选
bold_format = workbook.add_format({'bold': True})
date_format = workbook.add_format({'num_format': 'yyyy-mm-dd'})

# 打开 CSV 文件
with open(input_csv, 'r', encoding='utf-8') as csv_file:
csv_reader = csv.reader(csv_file)

for row_idx, row in enumerate(csv_reader):
for col_idx, value in enumerate(row):
try:
# 尝试转换为浮点数
num_value = float(value)
if num_value.is_integer():
# 如果是整数,写入整数
worksheet.write_number(row_idx, col_idx, int(num_value))
else:
# 如果是浮点数,写入浮点数
worksheet.write_number(row_idx, col_idx, num_value)
except ValueError:
# 如果不能转换为数字,写入字符串
worksheet.write_string(row_idx, col_idx, value)

# 关闭工作簿
workbook.close()
print(f"成功将 '{input_csv}' 转换为 '{output_xlsx}'")

except FileNotFoundError:
print(f"错误: 找不到输入文件 '{input_csv}'")
sys.exit(1)
except Exception as e:
print(f"发生错误: {str(e)}")
sys.exit(1)

if __name__ == "__main__":
if len(sys.argv) != 3:
print("用法: python csv_to_xlsx.py <input.csv> <output.xlsx>")
sys.exit(1)

input_csv = sys.argv[1]
output_xlsx = sys.argv[2]

csv_to_xlsx(input_csv, output_xlsx)

执行如下

python3 pip.pyz install xlsxwriter
time python3 c2xx.py "NYC_311_SR_2010-2020-sample-1M.csv" pynyc.xlsx
成功将 'NYC_311_SR_2010-2020-sample-1M.csv' 转换为 'pynyc.xlsx'

real6m13.559s
user6m4.476s
sys0m5.680s

因为python有内置的csv读取函数,所以它正确地处理了转义,得到的xlsx文件格式正确。
xlsx文件及其中SharedString和sheet1.xml大小如下表所示:

转换工具xlsxSharedStringsheet1
duckdb excel 249MB 0 1.7GB
c 188MB 120MB 1.1GB
python 220MB 107MB 1.5GB

2.用不同插件读取
2.1用excel插件

D load excel;
–不带all_varchar=1选项读取python生成的文件失败
D select count(*) from 'pynyc.xlsx';
0% ▕ ▏ Run Time (s): real 3.805 user 7.372000 sys 0.132000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I4018': Could not convert string '' to DOUBLE
–带all_varchar=1选项读取成功
D select count(*) from read_xlsx('pynyc.xlsx',all_varchar=1);
┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000000
(1.00 million)
└────────────────┘
Run Time (s): real 48.707 user 95.948000 sys 0.272000
header=1相当于自动判断格式,header=0相当于all_varchar,因为列标题都是字符串。

Run Time (s): real 0.111 user 0.176000 sys 0.016000
D select count(*) from read_xlsx('pynyc.xlsx',header=1);
0% ▕ ▏ Run Time (s): real 3.539 user 6.880000 sys 0.092000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I4018': Could not convert string '' to DOUBLE
D select count(*) from read_xlsx('ducknyc.xlsx',header=1);
Run Time (s): real 3.411 user 5.300000 sys 0.312000
Invalid Input Error:
read_xlsx: Failed to parse cell 'N4': Could not convert string 'WILLOUGHBY AVENUE' to DOUBLE
D select count(*) from read_xlsx('cnyc.xlsx',header=1);
0% ▕ ▏ Run Time (s): real 3.804 user 7.516000 sys 0.084000
Invalid Input Error:
read_xlsx: Failed to parse cell 'I10': Could not convert string '1305 EAST 19 STREET' to DOUBLE
D
D select count(*) from read_xlsx('pynyc.xlsx',header=0);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000001
(1.00 million)
└────────────────┘
Run Time (s): real 48.054 user 51.680000 sys 0.192000
D select count(*) from read_xlsx('ducknyc.xlsx',header=0);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000001
(1.00 million)
└────────────────┘
Run Time (s): real 44.848 user 87.600000 sys 0.444000
D select count(*) from read_xlsx('cnyc.xlsx',header=0);

┌──────────────┐
│ count_star()
│ int64 │
├──────────────┤
296663
└──────────────┘
Run Time (s): real 13.064 user 25.484000 sys 0.100000
D

2.2用rusty_sheet插件

D select count(*) from read_sheet('pynyc.xlsx',header=1);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000000
(1.00 million)
└────────────────┘
Run Time (s): real 76.280 user 71.944000 sys 4.160000
D select count(*) from read_sheet('ducknyc.xlsx',header=1);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000000
(1.00 million)
└────────────────┘
Run Time (s): real 46.921 user 57.200000 sys 0.640000
D select count(*) from read_sheet('cnyc.xlsx',header=1);
Run Time (s): real 38.976 user 38.220000 sys 0.656000
Binder Error:
Read spreadsheet failed: Missing column name at 'AP1'
D
D select count(*) from read_sheet('pynyc.xlsx',header=0);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000001
(1.00 million)
└────────────────┘
Run Time (s): real 77.455 user 93.204000 sys 4.148000
D select count(*) from read_sheet('ducknyc.xlsx',header=0);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000001
(1.00 million)
└────────────────┘
Run Time (s): real 49.145 user 61.764000 sys 0.704000
D select count(*) from read_sheet('cnyc.xlsx',header=0);

┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000217
(1.00 million)
└────────────────┘
Run Time (s): real 56.306 user 72.896000 sys 0.644000

2.3用sheetreader插件,因最高只支持Duckdb 1.2.2,故采用该版本测试。

./duckdb122
v1.2.2 7c039464e4
D .timer on
D load sheetreader;

D select count(*) from sheetreader('pynyc.xlsx',has_header=1);
┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000000
(1.00 million)
└────────────────┘
Run Time (s): real 17.722 user 46.544000 sys 0.368000
D select count(*) from sheetreader('ducknyc.xlsx',has_header=1);
Run Time (s): real 10.142 user 45.000000 sys 0.544000
Invalid Error:
Failed to retrieve shared strings file
D select count(*) from sheetreader('cnyc.xlsx',has_header=1);
Run Time (s): real 6.426 user 28.680000 sys 0.136000
Binder Error:
First row must contain only strings when has_header is set to true
D
D select count(*) from sheetreader('pynyc.xlsx',has_header=0);
┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000000
(1.00 million)
└────────────────┘
Run Time (s): real 17.470 user 55.900000 sys 0.160000
D select count(*) from sheetreader('ducknyc.xlsx',has_header=0);
Run Time (s): real 10.124 user 44.872000 sys 0.536000
Invalid Error:
Failed to retrieve shared strings file
D select count(*) from sheetreader('cnyc.xlsx',has_header=0);
┌────────────────┐
│ count_star()
│ int64 │
├────────────────┤
1000217
(1.00 million)
└────────────────┘
Run Time (s): real 13.244 user 42.060000 sys 0.184000

简单地数方框就知道rusty_sheet插件兼容性最好,sheetreader处理shared strings格式效率最高,它也处理不了别的格式。并行度大约是4。
DuckDB官方excel插件在全字符选项下兼容两种格式,奇怪的是带标题判断,它连自己生成的文件格式都判断错误。

对于识别的结果,我们用summarize命令来查看.
作为基准的duckdb识别原始csv文件

D create table t as from read_csv('NYC_311_SR_2010-2020-sample-1M.csv.mp3');
Run Time (s): real 11.214 user 19.248000 sys 1.760000
D summarize t;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬───┬──────────────────────┬──────────────────────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ max │ approx_unique │ … │ q50 │ q75 │ count │ null_percentage │
varcharvarcharvarcharvarchar │ int64 │ │ varcharvarchar │ int64 │ decimal(9,2)
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼───┼──────────────────────┼──────────────────────┼─────────┼─────────────────┤
Unique KeyBIGINT11465364484781731022792 │ … │ 328556514020143510000000.00
│ Created DateTIMESTAMP20100101 00:00:0020201223 01:25:51717832 │ … │ 20160211 14:40:2… │ 20180731 08:55:5… │ 10000000.00
│ Closed DateTIMESTAMP19000101 00:00:0021000101 00:00:00619261 │ … │ 20160226 00:08:4… │ 20180802 13:09:5… │ 10000002.86
│ Agency │ VARCHAR311 │ TLC │ 29 │ … │ NULLNULL10000000.00
│ Agency Name │ VARCHAR311 │ Valuation Policy │ 686 │ … │ NULLNULL10000000.00
│ Complaint TypeVARCHAR../../WEBINF/web.… │ ZTESTINT │ 281 │ … │ NULLNULL10000000.00
│ Descriptor │ VARCHAR1 Missed Collection │ unknown odor/taste… │ 1553 │ … │ NULLNULL10000000.30
...

然后用不同插件读取大家都识别正确的pynyc.xlsx。
duckdb官方excel插件全识别为varchar

D create table t as from read_xlsx('pynyc.xlsx',header=0);
D summarize t;
┌─────────────┬─────────────┬──────────────────────┬────────────────────────────────────────────────────┬───────────────┬───────┬───────┬───────┬───────┬───────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ max │ approx_unique │ avg │ std │ q25 │ q50 │ q75 │ count │ null_percentage │
varcharvarcharvarcharvarchar │ int64 │ int32 │ int32 │ int32 │ int32 │ int32 │ int64 │ decimal(9,2)
├─────────────┼─────────────┼──────────────────────┼────────────────────────────────────────────────────┼───────────────┼───────┼───────┼───────┼───────┼───────┼─────────┼─────────────────┤
│ A1 │ VARCHAR11465364Unique Key857267NULLNULLNULLNULLNULL10000010.00
│ B1 │ VARCHAR01/01/2010 01:05:5… │ Created Date920156NULLNULLNULLNULLNULL10000010.00
│ C1 │ VARCHAR │ │ Closed Date667996NULLNULLNULLNULLNULL10000010.00
│ D1 │ VARCHAR311 │ TLC │ 29NULLNULLNULLNULLNULL10000010.00
│ E1 │ VARCHAR311 │ Valuation Policy │ 686NULLNULLNULLNULLNULL10000010.00
│ F1 │ VARCHAR../../WEBINF/web.… │ ZTESTINT │ 281NULLNULLNULLNULLNULL10000010.00
│ G1 │ VARCHAR │ │ unknown odor/taste in drinking water (QA6)1553NULLNULLNULLNULLNULL10000010.00
│ H1 │ VARCHAR │ │ Wooded Area │ 149NULLNULLNULLNULLNULL10000010.00
│ I1 │ VARCHAR │ │ XXXXX │ 560NULLNULLNULLNULLNULL10000010.00
...

sheetreader只有double和varchar两种格式

D create table t as from sheetreader('pynyc.xlsx');
Run Time (s): real 22.319 user 62.448000 sys 2.436000
D summarize t;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬───┬────────────────────┬────────────────────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ max │ approx_unique │ … │ q50 │ q75 │ count │ null_percentage │
varcharvarcharvarcharvarchar │ int64 │ │ varcharvarchar │ int64 │ decimal(9,2)
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼───┼────────────────────┼────────────────────┼─────────┼─────────────────┤
Unique KeyDOUBLE11465364.048478173.0972278 │ … │ 32847889.4505127440204486.5782267310000000.00
│ Created DateVARCHAR01/01/2010 01:05:5… │ 12/31/2019 12:58:5… │ 815306 │ … │ NULLNULL10000000.00
│ Closed DateVARCHAR │ │ 12/31/2019 12:59:0… │ 736431 │ … │ NULLNULL10000000.00
│ Agency │ VARCHAR311 │ TLC │ 27 │ … │ NULLNULL10000000.00
│ Agency Name │ VARCHAR311 │ Valuation Policy │ 476 │ … │ NULLNULL10000000.00
│ Complaint TypeVARCHAR../../WEBINF/web.… │ ZTESTINT │ 247 │ … │ NULLNULL10000000.00
│ Descriptor │ VARCHAR │ │ unknown odor/taste… │ 1296 │ … │ NULLNULL10000000.00
│ Location TypeVARCHAR │ │ Wooded Area │ 147 │ … │ NULLNULL10000000.00
│ Incident Zip │ DOUBLE0.0900836.0438 │ … │ 11204.5187783145711234.27829456380610000005.51
...

rusty_sheet基本还原了整数类型,但日期型识别为字符串。

D load rusty_sheet;
D create table t2 as from read_sheet('pynyc.xlsx',header=1);
D summarize t2;
┌──────────────────────┬─────────────┬──────────────────────┬──────────────────────┬───────────────┬────────────────────┬───┬──────────┬──────────┬──────────┬─────────┬─────────────────┐
│ column_name │ column_type │ min │ max │ approx_unique │ avg │ … │ q25 │ q50 │ q75 │ count │ null_percentage │
varcharvarcharvarcharvarchar │ int64 │ varchar │ │ varcharvarcharvarchar │ int64 │ decimal(9,2)
├──────────────────────┼─────────────┼──────────────────────┼──────────────────────┼───────────────┼────────────────────┼───┼──────────┼──────────┼──────────┼─────────┼─────────────────┤
Unique KeyBIGINT1146536448478173102279232687965.858032 │ … │ 25252380328492874020484210000000.00
│ Created DateVARCHAR01/01/2010 01:05:5… │ 12/31/2019 12:58:5… │ 920156NULL │ … │ NULLNULLNULL10000000.00
│ Closed DateVARCHAR │ │ 12/31/2019 12:59:0… │ 667996NULL │ … │ NULLNULLNULL10000000.00
│ Agency │ VARCHAR311 │ TLC │ 29NULL │ … │ NULLNULLNULL10000000.00
│ Agency Name │ VARCHAR311 │ Valuation Policy │ 686NULL │ … │ NULLNULLNULL10000000.00
│ Complaint TypeVARCHAR../../WEBINF/web.… │ ZTESTINT │ 281NULL │ … │ NULLNULLNULL10000000.00
│ Descriptor │ VARCHAR │ │ unknown odor/taste… │ 1553NULL │ … │ NULLNULLNULL10000000.00
│ Location TypeVARCHAR │ │ Wooded Area │ 149NULL │ … │ NULLNULLNULL10000000.00
│ Incident Zip │ BIGINT090083654810825.273804119031 │ … │ 10337112041123410000005.51
│ Incident Address │ VARCHAR │ │ west 155 street an… │ 333521NULL │ … │ NULLNULLNULL10000000.00
...

以上仅是对一个文件的测试,不代表其他文件的情况。各种插件都有指定列类型的功能,加上后应该能取得更好的效果。

赞(0)
未经允许不得转载:网硕互联帮助中心 » 不同DuckDB插件对不同工具生成的xlsx文件读取速度的比较
分享到: 更多 (0)

评论 抢沙发

评论前必须登录!