-
-
Notifications
You must be signed in to change notification settings - Fork 755
Open
Description
Note: If this is a 'bug report' or 'feature request', please select the corresponding issue template; otherwise this issue will not be answered.
Describe the problem details
The tbox xml_reader is slower than expat. How can I speed it up?
I use the following code to convert the XML to CSV.
#include "../demo.h"
#include <stdio.h>
/* Number of CSV data columns; they correspond to spreadsheet columns A..Z. */
enum { CSV_COLUMN_COUNT = 26 };

/* Capacity of one buffered cell value, including the terminating NUL. */
enum { CELL_VALUE_MAXN = 256 };

/*
 * Convert the column letters at the start of a cell reference such as "B7"
 * or "AA12" into a zero-based column index (A -> 0, Z -> 25, AA -> 26, ...).
 *
 * Returns -1 when `ref` is null, does not start with an upper-case letter,
 * or has more than three column letters (which also bounds the arithmetic).
 */
static tb_int_t xlsx_column_index(tb_char_t const* ref)
{
    tb_int_t index = 0;
    tb_int_t letters = 0;

    if (!ref) return -1;

    for (; *ref >= 'A' && *ref <= 'Z'; ref++)
    {
        if (++letters > 3) return -1;
        index = index * 26 + (*ref - 'A' + 1);
    }
    return letters ? index - 1 : -1;
}

/*
 * Write one CSV field to `out`.
 *
 * A value containing a comma, a double quote, CR or LF is wrapped in double
 * quotes with embedded quotes doubled; any other value is written verbatim.
 * An empty value writes nothing.
 */
static void csv_write_field(FILE* out, tb_char_t const* value)
{
    tb_char_t const* p;
    tb_bool_t        need_quotes = tb_false;

    for (p = value; *p; p++)
    {
        if (*p == ',' || *p == '"' || *p == '\n' || *p == '\r')
        {
            need_quotes = tb_true;
            break;
        }
    }

    if (!need_quotes)
    {
        fputs(value, out);
        return;
    }

    fputc('"', out);
    for (p = value; *p; p++)
    {
        if (*p == '"') fputc('"', out);
        fputc(*p, out);
    }
    fputc('"', out);
}

/*
 * Stream a worksheet XML file and write its rows to "output.csv".
 *
 * Usage: <program> <xml file> [xpath]
 *
 * For every <row> element the row number is taken from its "r" attribute.
 * For every <c> element inside a row the column is taken from the letters
 * of its "r" attribute, and the cell value is the text found inside a
 * nested <v> or <t> element. Only columns A..Z are emitted; cells in other
 * columns are dropped. Rows without any stored value are not written.
 *
 * Returns 0 on success and -1 on any failure (bad arguments, the CSV file
 * cannot be created or written, the reader cannot be created or opened, or
 * the optional path is not found).
 */
tb_int_t main(tb_int_t argc, tb_char_t** argv)
{
    if (!tb_init(tb_null, tb_null)) return -1;

    tb_int_t            result = -1;
    FILE*               csv_file = NULL;
    tb_xml_reader_ref_t reader = tb_null;

    // parser state
    tb_int_t  current_row = -1;
    // NOTE(review): the column is only updated when a cell carries an "r"
    // attribute, so a cell without one reuses the previous cell's column.
    tb_int_t  current_col = 0;
    tb_bool_t in_row = tb_false;
    tb_bool_t in_cell = tb_false;
    tb_bool_t has_cell_data = tb_false;
    tb_bool_t row_has_data = tb_false;
    tb_char_t cell_value[CELL_VALUE_MAXN] = {0};
    tb_char_t row_data[CSV_COLUMN_COUNT][CELL_VALUE_MAXN] = {{0}};
    tb_size_t event = TB_XML_READER_EVENT_NONE;

    // check arguments
    if (argc < 2)
    {
        tb_printf("用法: %s <xml文件> [xpath]\n", argv[0]);
        goto cleanup;
    }

    // create the CSV file
    csv_file = fopen("output.csv", "w");
    if (!csv_file)
    {
        tb_printf("无法创建CSV文件\n");
        goto cleanup;
    }

    // write the CSV header: Row,A,B,...,Z
    fprintf(csv_file, "Row,");
    for (char col = 'A'; col <= 'Z'; col++)
    {
        fprintf(csv_file, "%c", col);
        if (col < 'Z') fprintf(csv_file, ",");
    }
    fprintf(csv_file, "\n");

    // create the reader
    reader = tb_xml_reader_init();
    if (!reader)
    {
        tb_printf("无法初始化XML reader\n");
        goto cleanup;
    }

    // open the reader on the input file
    if (!tb_xml_reader_open(reader, tb_stream_init_from_url(argv[1]), tb_true))
    {
        tb_printf("无法打开XML文件: %s\n", argv[1]);
        goto cleanup;
    }

    // jump to the requested path, if one was given
    if (argc > 2 && !tb_xml_reader_goto(reader, argv[2]))
    {
        tb_printf("找不到指定路径: %s\n", argv[2]);
        goto cleanup;
    }

    // walk the XML events
    while ((event = tb_xml_reader_next(reader)))
    {
        switch (event)
        {
        case TB_XML_READER_EVENT_ELEMENT_BEG:
            {
                tb_char_t const* name = tb_xml_reader_element(reader);
                if (!name) break;

                if (tb_strcmp(name, "row") == 0)
                {
                    // start a new row with empty column buffers
                    in_row = tb_true;
                    row_has_data = tb_false;
                    tb_memset(row_data, 0, sizeof(row_data));

                    // read the row number from the "r" attribute
                    tb_xml_node_ref_t attr = tb_xml_reader_attributes(reader);
                    for (; attr; attr = attr->next)
                    {
                        if (tb_strcmp(tb_string_cstr(&attr->name), "r") == 0)
                        {
                            current_row = tb_atoi(tb_string_cstr(&attr->data));
                            break;
                        }
                    }
                }
                else if (tb_strcmp(name, "c") == 0 && in_row)
                {
                    // start a new cell
                    in_cell = tb_true;
                    tb_memset(cell_value, 0, sizeof(cell_value));

                    // read the column from the letters of the "r" attribute
                    tb_xml_node_ref_t attr = tb_xml_reader_attributes(reader);
                    for (; attr; attr = attr->next)
                    {
                        if (tb_strcmp(tb_string_cstr(&attr->name), "r") == 0)
                        {
                            current_col = xlsx_column_index(tb_string_cstr(&attr->data));
                            break;
                        }
                    }
                }
                else if ((tb_strcmp(name, "v") == 0 || tb_strcmp(name, "t") == 0) && in_cell)
                {
                    // the next text event carries the cell value
                    has_cell_data = tb_true;
                }
            }
            break;
        case TB_XML_READER_EVENT_TEXT:
            if (has_cell_data)
            {
                tb_char_t const* text = tb_xml_reader_text(reader);
                if (text)
                {
                    // bounded copy; values longer than the buffer are truncated
                    tb_strncpy(cell_value, text, sizeof(cell_value) - 1);
                    cell_value[sizeof(cell_value) - 1] = '\0';
                }
            }
            break;
        case TB_XML_READER_EVENT_ELEMENT_END:
            {
                tb_char_t const* name = tb_xml_reader_element(reader);
                if (!name) break;

                if (tb_strcmp(name, "row") == 0)
                {
                    // end of row: emit it if at least one cell was stored
                    if (row_has_data)
                    {
                        fprintf(csv_file, "%d,", current_row);
                        for (tb_int_t i = 0; i < CSV_COLUMN_COUNT; i++)
                        {
                            if (i > 0) fputc(',', csv_file);
                            csv_write_field(csv_file, row_data[i]);
                        }
                        fputc('\n', csv_file);
                    }
                    in_row = tb_false;
                }
                else if (tb_strcmp(name, "c") == 0 && in_row)
                {
                    // end of cell: store a non-empty value in its column
                    if (cell_value[0] != '\0' && current_col >= 0 && current_col < CSV_COLUMN_COUNT)
                    {
                        tb_strncpy(row_data[current_col], cell_value, sizeof(row_data[current_col]) - 1);
                        row_data[current_col][sizeof(row_data[current_col]) - 1] = '\0';
                        row_has_data = tb_true;
                    }
                    in_cell = tb_false;
                    has_cell_data = tb_false;
                }
                else if ((tb_strcmp(name, "v") == 0 || tb_strcmp(name, "t") == 0) && in_cell)
                {
                    has_cell_data = tb_false;
                }
            }
            break;
        default:
            break;
        }
    }

    result = 0;

cleanup:
    // release the reader
    if (reader) tb_xml_reader_exit(reader);

    // close the CSV file and surface any write error
    if (csv_file)
    {
        tb_bool_t write_failed = ferror(csv_file) ? tb_true : tb_false;
        if (fclose(csv_file) != 0) write_failed = tb_true;
        if (write_failed && result == 0)
        {
            tb_printf("写入CSV文件失败\n");
            result = -1;
        }
    }

    if (result == 0) tb_printf("CSV文件已生成: output.csv\n");

    // shut down the library initialized by tb_init
    tb_exit();
    return result;
}
gcc src/demo/xml/tocsv.c -o tocsv -I build/linux/arm64/release build/linux/arm64/release/libtbox.a -lm -O3
time ./tocsv /par/lineitem/xl/worksheets/sheet1.xml
CSV文件已生成: output.csv
real 12m32.261s
user 3m13.624s
sys 9m17.076s
The expat version takes about 20 s to finish.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels