Thanks to visit codestin.com
Credit goes to GitHub.com

Skip to content

the tbox xml_reader is slower than expat, how to speed it up #284

@l1t1

Description

@l1t1

Note: If this is a bug report or a feature request, please select the corresponding issue template; otherwise this issue will not be answered.

Describe the problem details

The tbox xml_reader is much slower than expat. How can I speed it up?
I use the following code to convert an XML worksheet to CSV.

#include "../demo.h"
#include <stdio.h>

// Fixed layout of the generated table: one "Row" column followed by columns A..Z.
enum
{
    TOCSV_COLUMN_COUNT = 26,
    TOCSV_CELL_MAXN    = 256 // bytes per cell buffer, including the terminating NUL
};

/* Map a cell reference such as "B12" to a zero-based column index.
 *
 * @param ref   the cell reference text, may be NULL
 *
 * @return      0..25 for the single-letter columns A..Z, or -1 when the
 *              reference is missing, does not start with an uppercase letter,
 *              or names a column beyond Z ("AA", "AB", ...) that the
 *              26-column table cannot hold
 */
static int tocsv_column_index(char const* ref)
{
    if (!ref || ref[0] < 'A' || ref[0] > 'Z') return -1;

    // a second letter means column AA or later; taking only the first letter
    // would silently overwrite columns A..Z with unrelated data
    if (ref[1] >= 'A' && ref[1] <= 'Z') return -1;

    return ref[0] - 'A';
}

/* Copy src into dst, truncating to dstn - 1 bytes and always NUL-terminating.
 *
 * @param dst   the destination buffer
 * @param dstn  the size of the destination buffer in bytes
 * @param src   the source string; NULL is treated as an empty string
 */
static void tocsv_copy_text(char* dst, size_t dstn, char const* src)
{
    size_t i = 0;
    if (!dstn) return;
    if (src)
    {
        for (; i + 1 < dstn && src[i]; i++) dst[i] = src[i];
    }
    dst[i] = '\0';
}

/* Write one CSV field, quoting it when it contains a comma, a double quote or
 * a line break, and doubling embedded double quotes.
 *
 * @param fp    the output file
 * @param text  the NUL-terminated field value
 */
static void tocsv_write_field(FILE* fp, char const* text)
{
    int         quoted = 0;
    char const* p;
    for (p = text; *p; p++)
    {
        if (*p == ',' || *p == '"' || *p == '\n' || *p == '\r')
        {
            quoted = 1;
            break;
        }
    }

    // the common case: nothing to escape
    if (!quoted)
    {
        fputs(text, fp);
        return;
    }

    fputc('"', fp);
    for (p = text; *p; p++)
    {
        if (*p == '"') fputc('"', fp);
        fputc(*p, fp);
    }
    fputc('"', fp);
}

/* Convert the <row>/<c>/<v>|<t> elements of an XML worksheet into output.csv.
 *
 * Usage: <program> <xml file> [xpath]
 *
 * Each <row> that contains at least one non-empty cell in columns A..Z becomes
 * one CSV line: the row number followed by 26 fields. Cells in columns beyond
 * Z are skipped, and cell text longer than 255 bytes is truncated.
 *
 * NOTE(review): tb_exit() is not called on any path; tbox programs normally
 * pair it with tb_init() - confirm and add it.
 *
 * @return 0 on success, -1 on bad arguments or any read/write failure
 */
tb_int_t main(tb_int_t argc, tb_char_t** argv)
{
    if (!tb_init(tb_null, tb_null)) return -1;

    // check the arguments
    if (argc < 2)
    {
        tb_printf("用法: %s <xml文件> [xpath]\n", argv[0]);
        return -1;
    }

    // create the CSV file
    FILE* csv_file = fopen("output.csv", "w");
    if (!csv_file)
    {
        tb_printf("无法创建CSV文件\n");
        return -1;
    }

    // write the CSV header: Row,A,B,...,Z
    fputs("Row", csv_file);
    for (int i = 0; i < TOCSV_COLUMN_COUNT; i++) fprintf(csv_file, ",%c", 'A' + i);
    fputc('\n', csv_file);

    // stays false unless the document was opened and walked to the end
    tb_bool_t ok = tb_false;

    // init the reader
    tb_xml_reader_ref_t reader = tb_xml_reader_init();
    if (reader)
    {
        // open the reader on a stream created from the given url
        if (tb_xml_reader_open(reader, tb_stream_init_from_url(argv[1]), tb_true))
        {
            // jump to the requested path, if one was given
            ok = tb_true;
            if (argc > 2) ok = tb_xml_reader_goto(reader, argv[2]);

            // parser state
            tb_int_t  current_row  = 0;        // number of the row being read
            tb_int_t  cell_col     = -1;       // column index of the current cell, -1 if unusable
            tb_int_t  next_col     = 0;        // column of a cell that carries no "r" attribute
            tb_bool_t in_row       = tb_false;
            tb_bool_t in_cell      = tb_false;
            tb_bool_t in_value     = tb_false; // inside <v> or <t> of the current cell
            tb_bool_t row_has_data = tb_false;
            tb_char_t cell_value[TOCSV_CELL_MAXN] = {0};

            // the column values of the current row
            tb_char_t row_data[TOCSV_COLUMN_COUNT][TOCSV_CELL_MAXN] = {{0}};

            // walk the XML events
            tb_size_t event = TB_XML_READER_EVENT_NONE;
            while (ok && (event = tb_xml_reader_next(reader)))
            {
                switch (event)
                {
                case TB_XML_READER_EVENT_ELEMENT_BEG:
                    {
                        tb_char_t const* name = tb_xml_reader_element(reader);
                        if (!name) break;

                        if (tb_strcmp(name, "row") == 0)
                        {
                            // start a new row; only the first byte of every
                            // slot is ever tested, so clearing it is enough
                            in_row       = tb_true;
                            row_has_data = tb_false;
                            next_col     = 0;
                            for (tb_int_t i = 0; i < TOCSV_COLUMN_COUNT; i++) row_data[i][0] = '\0';

                            // a row without an "r" attribute follows the previous row
                            current_row++;
                            tb_xml_node_ref_t attr = tb_xml_reader_attributes(reader);
                            for (; attr; attr = attr->next)
                            {
                                if (tb_strcmp(tb_string_cstr(&attr->name), "r") == 0)
                                {
                                    current_row = tb_atoi(tb_string_cstr(&attr->data));
                                    break;
                                }
                            }
                        }
                        else if (in_row && tb_strcmp(name, "c") == 0)
                        {
                            // start a new cell
                            in_cell       = tb_true;
                            cell_value[0] = '\0';

                            // a cell without an "r" attribute follows the previous cell
                            cell_col = next_col;
                            tb_xml_node_ref_t attr = tb_xml_reader_attributes(reader);
                            for (; attr; attr = attr->next)
                            {
                                if (tb_strcmp(tb_string_cstr(&attr->name), "r") == 0)
                                {
                                    cell_col = tocsv_column_index(tb_string_cstr(&attr->data));
                                    break;
                                }
                            }
                        }
                        else if (in_cell && (tb_strcmp(name, "v") == 0 || tb_strcmp(name, "t") == 0))
                        {
                            // the next text event carries the cell value
                            in_value = tb_true;
                        }
                    }
                    break;

                case TB_XML_READER_EVENT_TEXT:
                    if (in_value)
                    {
                        // keep the cell value (truncated to the buffer size)
                        tocsv_copy_text(cell_value, sizeof(cell_value), tb_xml_reader_text(reader));
                    }
                    break;

                case TB_XML_READER_EVENT_ELEMENT_END:
                    {
                        tb_char_t const* name = tb_xml_reader_element(reader);
                        if (!name) break;

                        if (tb_strcmp(name, "row") == 0)
                        {
                            // end of row: emit it unless every cell was empty
                            if (row_has_data)
                            {
                                fprintf(csv_file, "%d", current_row);
                                for (tb_int_t i = 0; i < TOCSV_COLUMN_COUNT; i++)
                                {
                                    fputc(',', csv_file);
                                    tocsv_write_field(csv_file, row_data[i]);
                                }
                                fputc('\n', csv_file);
                            }
                            in_row = tb_false;
                        }
                        else if (in_row && tb_strcmp(name, "c") == 0)
                        {
                            // end of cell: store the value in its column slot
                            if (cell_value[0] != '\0' && cell_col >= 0 && cell_col < TOCSV_COLUMN_COUNT)
                            {
                                tocsv_copy_text(row_data[cell_col], sizeof(row_data[cell_col]), cell_value);
                                row_has_data = tb_true;
                            }
                            if (cell_col >= 0) next_col = cell_col + 1;
                            in_cell  = tb_false;
                            in_value = tb_false;
                        }
                        else if (in_cell && (tb_strcmp(name, "v") == 0 || tb_strcmp(name, "t") == 0))
                        {
                            in_value = tb_false;
                        }
                    }
                    break;

                default:
                    break;
                }
            }
        }
        else
        {
            tb_printf("无法打开XML文件: %s\n", argv[1]);
        }

        // release the reader
        tb_xml_reader_exit(reader);
    }

    // fclose() flushes, so a failed write may only surface here
    if (ferror(csv_file)) ok = tb_false;
    if (fclose(csv_file) != 0) ok = tb_false;

    if (!ok)
    {
        tb_printf("CSV文件生成失败\n");
        return -1;
    }

    tb_printf("CSV文件已生成: output.csv\n");
    return 0;
}
gcc src/demo/xml/tocsv.c -o tocsv -I build/linux/arm64/release build/linux/arm64/release/libtbox.a -lm -O3
time ./tocsv /par/lineitem/xl/worksheets/sheet1.xml
CSV文件已生成: output.csv

real	12m32.261s
user	3m13.624s
sys	9m17.076s

The expat version of the same program takes about 20 seconds to finish.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions