Thanks to visit codestin.com
Credit goes to github.com

Skip to content
23 changes: 23 additions & 0 deletions data/csv/auto/early_out_error.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
a,b,c
1,1,1
1,1,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,\n,1
1,1,1
1,1,1
5 changes: 5 additions & 0 deletions src/common/enum_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,8 @@ const char* EnumUtil::ToChars<CSVState>(CSVState value) {
return "EMPTY_SPACE";
case CSVState::COMMENT:
return "COMMENT";
case CSVState::STANDARD_NEWLINE:
return "STANDARD_NEWLINE";
default:
throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented in ToChars<CSVState>", value));
}
Expand Down Expand Up @@ -876,6 +878,9 @@ CSVState EnumUtil::FromString<CSVState>(const char *value) {
if (StringUtil::Equals(value, "COMMENT")) {
return CSVState::COMMENT;
}
if (StringUtil::Equals(value, "STANDARD_NEWLINE")) {
return CSVState::STANDARD_NEWLINE;
}
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented in FromString<CSVState>", value));
}

Expand Down
169 changes: 101 additions & 68 deletions src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1404,34 +1404,19 @@ bool StringValueResult::PrintErrorLine() const {
(state_machine.options.store_rejects.GetValue() || !state_machine.options.ignore_errors.GetValue());
}

void StringValueScanner::SkipUntilNewLine() {
// Now skip until next newline
if (state_machine->options.dialect_options.state_machine_options.new_line.GetValue() ==
NewLineIdentifier::CARRY_ON) {
bool carriage_return = false;
bool not_carriage_return = false;
for (; iterator.pos.buffer_pos < cur_buffer_handle->actual_size; iterator.pos.buffer_pos++) {
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\r') {
carriage_return = true;
} else if (buffer_handle_ptr[iterator.pos.buffer_pos] != '\n') {
not_carriage_return = true;
}
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n') {
if (carriage_return || not_carriage_return) {
iterator.pos.buffer_pos++;
return;
}
}
bool StringValueScanner::SkipUntilState(CSVState initial_state, CSVState until_state) {
CSVStates current_state;
current_state.Initialize(initial_state);
while (iterator.pos.buffer_pos < cur_buffer_handle->actual_size) {
state_machine->Transition(current_state, buffer_handle_ptr[iterator.pos.buffer_pos++]);
if (current_state.IsState(until_state)) {
return true;
}
} else {
for (; iterator.pos.buffer_pos < cur_buffer_handle->actual_size; iterator.pos.buffer_pos++) {
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n' ||
buffer_handle_ptr[iterator.pos.buffer_pos] == '\r') {
iterator.pos.buffer_pos++;
return;
}
if (current_state.IsState(CSVState::INVALID)) {
return false;
}
}
return true;
}

bool StringValueScanner::CanDirectlyCast(const LogicalType &type, bool icu_loaded) {
Expand Down Expand Up @@ -1463,6 +1448,63 @@ bool StringValueScanner::CanDirectlyCast(const LogicalType &type, bool icu_loade
}
}

bool StringValueScanner::IsRowValid() {
if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size) {
return false;
}
constexpr idx_t result_size = 1;
auto scan_finder =
make_uniq<StringValueScanner>(0U, buffer_manager, state_machine, make_shared_ptr<CSVErrorHandler>(),
csv_file_scan, false, iterator, result_size);
auto &tuples = scan_finder->ParseChunk();
return tuples.number_of_rows == 1 && tuples.borked_rows.empty();
}

// Speculatively tests one hypothesis about the parser state at the current buffer position.
//   state     - CSV state the scan is assumed to start in
//   start_pos - lowered to the position right after the skipped-to record separator when the
//               row that follows it is valid (the smallest candidate seen so far is kept)
//   end_pos   - raised to the furthest position reached by a successful skip
//   valid     - set to true when a valid row was found; never reset to false here
// The iterator's buffer position is always restored before returning.
void StringValueScanner::TryRow(CSVState state, idx_t &start_pos, idx_t &end_pos, bool &valid) {
	const idx_t saved_pos = iterator.pos.buffer_pos;
	const bool skip_succeeded = SkipUntilState(state, CSVState::RECORD_SEPARATOR);
	if (skip_succeeded) {
		const idx_t candidate_start = iterator.pos.buffer_pos;
		if (IsRowValid()) {
			valid = true;
			if (candidate_start < start_pos) {
				start_pos = candidate_start;
			}
		}
		// Re-read the iterator position after validation, as the original code does.
		const idx_t reached_pos = iterator.pos.buffer_pos;
		if (end_pos < reached_pos) {
			end_pos = reached_pos;
		}
	}
	// Undo the speculative skip so the next hypothesis starts from the same place.
	iterator.pos.buffer_pos = saved_pos;
}

// Returns the buffer position immediately after the next line break, scanning forward from the
// current iterator position without moving the iterator. If no line break is found before the
// end of the current buffer, the position where the scan stopped is returned.
idx_t StringValueScanner::FindNextNewLine() const {
	const idx_t buffer_end = cur_buffer_handle->actual_size;
	const bool is_carry_on = state_machine->options.dialect_options.state_machine_options.new_line.GetValue() ==
	                         NewLineIdentifier::CARRY_ON;
	idx_t scan_pos = iterator.pos.buffer_pos;

	if (!is_carry_on) {
		// Either '\n' or '\r' ends the line on its own.
		while (scan_pos < buffer_end) {
			const auto current_char = buffer_handle_ptr[scan_pos++];
			if (current_char == '\n' || current_char == '\r') {
				return scan_pos;
			}
		}
		return scan_pos;
	}

	// CARRY_ON: a '\n' only counts once at least one byte other than '\n' (a '\r' or any
	// regular character) has been seen since the scan began; leading '\n' bytes are skipped.
	bool seen_non_newline = false;
	while (scan_pos < buffer_end) {
		const auto current_char = buffer_handle_ptr[scan_pos++];
		if (current_char != '\n') {
			seen_non_newline = true;
			continue;
		}
		if (seen_non_newline) {
			return scan_pos;
		}
	}
	return scan_pos;
}

void StringValueScanner::SetStart() {
if (iterator.first_one) {
if (result.store_line_size) {
Expand All @@ -1472,50 +1514,41 @@ void StringValueScanner::SetStart() {
}
// The result size of the data after skipping the row is one line
// We have to look for a new line that fits our schema
// 1. We walk until the next new line
bool line_found;
unique_ptr<StringValueScanner> scan_finder;
do {
constexpr idx_t result_size = 1;
SkipUntilNewLine();
if (state_machine->options.null_padding) {
// When Null Padding, we assume we start from the correct new-line
return;
}
scan_finder =
make_uniq<StringValueScanner>(0U, buffer_manager, state_machine, make_shared_ptr<CSVErrorHandler>(true),
csv_file_scan, false, iterator, result_size);
auto &tuples = scan_finder->ParseChunk();
line_found = true;
if (tuples.number_of_rows != 1 ||
(!tuples.borked_rows.empty() && !state_machine->options.ignore_errors.GetValue()) ||
tuples.first_line_is_comment) {
line_found = false;
// If no tuples were parsed, this is not the correct start, we need to skip until the next new line
// Or if columns don't match, this is not the correct start, we need to skip until the next new line
if (scan_finder->previous_buffer_handle) {
if (scan_finder->iterator.pos.buffer_pos >= scan_finder->previous_buffer_handle->actual_size &&
scan_finder->previous_buffer_handle->is_last_buffer) {
iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
iterator.done = scan_finder->iterator.done;
return;
}
}
if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size ||
scan_finder->iterator.GetBufferIdx() > iterator.GetBufferIdx()) {
// If things go terribly wrong, we never loop indefinitely.
iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
iterator.done = scan_finder->iterator.done;
return;
}
idx_t next_new_line = FindNextNewLine();
idx_t potential_start = cur_buffer_handle->actual_size;
idx_t largest_end_pos = 0;
bool any_valid_row = false;
if (state_machine->options.null_padding) {
// When Null Padding, we assume we start from the correct new-line
return;
}
// At this point we have 3 options:
// 1. We are at the start of a valid line
TryRow(CSVState::STANDARD_NEWLINE, potential_start, largest_end_pos, any_valid_row);
// 2. We are in the middle of a quoted value
if (potential_start > next_new_line &&
state_machine->dialect_options.state_machine_options.quote.GetValue() != '\0') {
TryRow(CSVState::QUOTED, potential_start, largest_end_pos, any_valid_row);
}
// 3. We are in an escaped value
if (!any_valid_row && potential_start > next_new_line &&
state_machine->dialect_options.state_machine_options.escape.GetValue() != '\0') {
TryRow(CSVState::ESCAPE, potential_start, largest_end_pos, any_valid_row);
}
if (!any_valid_row) {
bool is_this_the_end = largest_end_pos == cur_buffer_handle->actual_size && cur_buffer_handle->is_last_buffer;
if (is_this_the_end) {
iterator.pos.buffer_pos = largest_end_pos;
iterator.done = true;
} else {
SkipUntilState(CSVState::STANDARD_NEWLINE, CSVState::RECORD_SEPARATOR);
}
} while (!line_found);
iterator.pos.buffer_idx = scan_finder->result.current_line_position.begin.buffer_idx;
iterator.pos.buffer_pos = scan_finder->result.current_line_position.begin.buffer_pos;
} else {
iterator.pos.buffer_pos = potential_start;
iterator.done = iterator.pos.buffer_pos == cur_buffer_handle->actual_size;
}
// 4. We have an error, if we have an error, we let life go on, the scanner will either ignore it
// or throw.
result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,20 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op

// Now set values depending on configuration
// 1) Standard/Invalid State
vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID)};
vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID),
static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)};
for (auto &state : std_inv) {
transition_array[delimiter][state] = CSVState::DELIMITER;
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
if (new_line_id == NewLineIdentifier::CARRY_ON) {
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::CARRIAGE_RETURN;
if (state == static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)) {
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::STANDARD;
} else {
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
}
} else {
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR;
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
}
if (comment != '\0') {
transition_array[comment][state] = CSVState::COMMENT;
Expand Down
25 changes: 13 additions & 12 deletions src/include/duckdb/execution/operator/csv_scanner/csv_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,19 @@ namespace duckdb {

//! All States of CSV Parsing
enum class CSVState : uint8_t {
STANDARD = 0, //! Regular unquoted field state
DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
QUOTED = 4, //! State when inside a quoted field
UNQUOTED = 5, //! State when leaving a quoted field
ESCAPE = 6, //! State when encountering an escape character (e.g., \)
INVALID = 7, //! Got to an Invalid State, this should error.
NOT_SET = 8, //! If the state is not set, usually the first state before getting the first character
QUOTED_NEW_LINE = 9, //! If we have a quoted newline
EMPTY_SPACE = 10, //! If we have empty spaces in the beginning and end of value
COMMENT = 11 //! If we are in a comment state, and hence have to skip the whole line
STANDARD = 0, //! Regular unquoted field state
DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
QUOTED = 4, //! State when inside a quoted field
UNQUOTED = 5, //! State when leaving a quoted field
ESCAPE = 6, //! State when encountering an escape character (e.g., \)
INVALID = 7, //! Got to an Invalid State, this should error.
NOT_SET = 8, //! If the state is not set, usually the first state before getting the first character
QUOTED_NEW_LINE = 9, //! If we have a quoted newline
EMPTY_SPACE = 10, //! If we have empty spaces in the beginning and end of value
COMMENT = 11, //! If we are in a comment state, and hence have to skip the whole line
STANDARD_NEWLINE = 12, //! State used for figuring out a new line.
};

} // namespace duckdb
Original file line number Diff line number Diff line change
Expand Up @@ -17,65 +17,68 @@ namespace duckdb {
//! State of necessary CSV States to parse file
//! Current, previous, and state before the previous
struct CSVStates {
void Initialize() {
states[0] = CSVState::NOT_SET;
states[1] = CSVState::NOT_SET;
void Initialize(CSVState initial_state = CSVState::NOT_SET) {
states[0] = initial_state;
states[1] = initial_state;
}
inline bool NewValue() {
inline bool NewValue() const {
return states[1] == CSVState::DELIMITER;
}

inline bool NewRow() {
inline bool NewRow() const {
// It is a new row, if the previous state is not a record separator, and the current one is
return states[0] != CSVState::RECORD_SEPARATOR && states[0] != CSVState::CARRIAGE_RETURN &&
(states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN);
}

inline bool WasStandard() {
inline bool WasStandard() const {
return states[0] == CSVState::STANDARD;
}

inline bool EmptyLastValue() {
inline bool EmptyLastValue() const {
// It is a new row, if the previous state is not a record separator, and the current one is
return states[0] == CSVState::DELIMITER &&
(states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN ||
states[1] == CSVState::DELIMITER);
}

inline bool EmptyLine() {
inline bool EmptyLine() const {
return (states[1] == CSVState::CARRIAGE_RETURN || states[1] == CSVState::RECORD_SEPARATOR) &&
(states[0] == CSVState::RECORD_SEPARATOR || states[0] == CSVState::NOT_SET);
}

inline bool IsNotSet() {
inline bool IsNotSet() const {
return states[1] == CSVState::NOT_SET;
}

inline bool IsComment() {
inline bool IsComment() const {
return states[1] == CSVState::COMMENT;
}

inline bool IsCurrentNewRow() {
inline bool IsCurrentNewRow() const {
return states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN;
}

inline bool IsCarriageReturn() {
inline bool IsCarriageReturn() const {
return states[1] == CSVState::CARRIAGE_RETURN;
}

inline bool IsInvalid() {
inline bool IsInvalid() const {
return states[1] == CSVState::INVALID;
}

inline bool IsQuoted() {
inline bool IsQuoted() const {
return states[0] == CSVState::QUOTED;
}
inline bool IsEscaped() {
inline bool IsEscaped() const {
return states[1] == CSVState::ESCAPE || (states[0] == CSVState::UNQUOTED && states[1] == CSVState::QUOTED);
}
inline bool IsQuotedCurrent() {
inline bool IsQuotedCurrent() const {
return states[1] == CSVState::QUOTED || states[1] == CSVState::QUOTED_NEW_LINE;
}
inline bool IsState(const CSVState state) const {
return states[1] == state;
}
CSVState states[2];
};

Expand All @@ -98,7 +101,7 @@ class CSVStateMachine {
states.states[1] = transition_array[static_cast<uint8_t>(current_char)][static_cast<uint8_t>(states.states[1])];
}

void Print() {
void Print() const {
std::cout << "State Machine Options" << '\n';
std::cout << "Delim: " << state_machine_options.delimiter.GetValue() << '\n';
std::cout << "Quote: " << state_machine_options.quote.GetValue() << '\n';
Expand Down
Loading
Loading