duckdb · Mytherin · Oct 8, 2024 · Oct 1, 2024 · Oct 1, 2024 · Oct 2, 2024
diff --git a/data/csv/auto/early_out_error.csv b/data/csv/auto/early_out_error.csv
@@ -0,0 +1,23 @@
+a,b,c
+1,1,1
+1,1,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,\n,1
+1,1,1
+1,1,1
diff --git a/src/common/enum_util.cpp b/src/common/enum_util.cpp
@@ -833,6 +833,8 @@ const char* EnumUtil::ToChars<CSVState>(CSVState value) {
 		return "EMPTY_SPACE";
 	case CSVState::COMMENT:
 		return "COMMENT";
+	case CSVState::STANDARD_NEWLINE:
+		return "STANDARD_NEWLINE";
 	default:
 		throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented in ToChars<CSVState>", value));
 	}
@@ -876,6 +878,9 @@ CSVState EnumUtil::FromString<CSVState>(const char *value) {
 	if (StringUtil::Equals(value, "COMMENT")) {
 		return CSVState::COMMENT;
 	}
+	if (StringUtil::Equals(value, "STANDARD_NEWLINE")) {
+		return CSVState::STANDARD_NEWLINE;
+	}
 	throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented in FromString<CSVState>", value));
 }
 

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -1404,34 +1404,19 @@ bool StringValueResult::PrintErrorLine() const {
 	       (state_machine.options.store_rejects.GetValue() || !state_machine.options.ignore_errors.GetValue());
 }
 
-void StringValueScanner::SkipUntilNewLine() {
-	// Now skip until next newline
-	if (state_machine->options.dialect_options.state_machine_options.new_line.GetValue() ==
-	    NewLineIdentifier::CARRY_ON) {
-		bool carriage_return = false;
-		bool not_carriage_return = false;
-		for (; iterator.pos.buffer_pos < cur_buffer_handle->actual_size; iterator.pos.buffer_pos++) {
-			if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\r') {
-				carriage_return = true;
-			} else if (buffer_handle_ptr[iterator.pos.buffer_pos] != '\n') {
-				not_carriage_return = true;
-			}
-			if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n') {
-				if (carriage_return || not_carriage_return) {
-					iterator.pos.buffer_pos++;
-					return;
-				}
-			}
+bool StringValueScanner::SkipUntilState(CSVState initial_state, CSVState until_state) {
+	CSVStates current_state; // local state tracker, set to initial_state on the next line
+	current_state.Initialize(initial_state);
+	while (iterator.pos.buffer_pos < cur_buffer_handle->actual_size) { // one byte per iteration
+		state_machine->Transition(current_state, buffer_handle_ptr[iterator.pos.buffer_pos++]);
+		if (current_state.IsState(until_state)) {
+			return true; // reached until_state; buffer_pos already points past the byte that caused it
 		}
-	} else {
-		for (; iterator.pos.buffer_pos < cur_buffer_handle->actual_size; iterator.pos.buffer_pos++) {
-			if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n' ||
-			    buffer_handle_ptr[iterator.pos.buffer_pos] == '\r') {
-				iterator.pos.buffer_pos++;
-				return;
-			}
+		if (current_state.IsState(CSVState::INVALID)) {
+			return false; // the state machine ended up in INVALID before until_state was seen
 		}
 	}
+	return true; // NOTE: buffer exhausted without reaching until_state or INVALID also reports true
 }
 
 bool StringValueScanner::CanDirectlyCast(const LogicalType &type, bool icu_loaded) {
@@ -1463,6 +1448,63 @@ bool StringValueScanner::CanDirectlyCast(const LogicalType &type, bool icu_loade
 	}
 }
 
+bool StringValueScanner::IsRowValid() { // Probe: does exactly one clean row parse from the current position?
+	if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size) {
+		return false; // at the end of the buffer: nothing left to parse
+	}
+	constexpr idx_t result_size = 1; // the probe scanner is only asked for a single row
+	auto scan_finder =
+	    make_uniq<StringValueScanner>(0U, buffer_manager, state_machine, make_shared_ptr<CSVErrorHandler>(),
+	                                  csv_file_scan, false, iterator, result_size);
+	auto &tuples = scan_finder->ParseChunk();
+	return tuples.number_of_rows == 1 && tuples.borked_rows.empty(); // one row, none recorded as borked
+}
+
+void StringValueScanner::TryRow(CSVState state, idx_t &start_pos, idx_t &end_pos, bool &valid) {
+	idx_t initial_pos = iterator.pos.buffer_pos; // remembered so the probe can be undone at the end
+	if (SkipUntilState(state, CSVState::RECORD_SEPARATOR)) { // assume we start in `state`, run to a row end
+		idx_t current_pos = iterator.pos.buffer_pos;
+		if (IsRowValid()) {
+			valid = true; // only ever set here, never cleared: results accumulate across calls
+			start_pos = std::min(start_pos, current_pos); // keep the earliest validated row start
+		}
+		end_pos = std::max(end_pos, iterator.pos.buffer_pos); // furthest position any probe reached
+	}
+	// restore the scanner position: this call was only a probe
+	iterator.pos.buffer_pos = initial_pos;
+}
+
+idx_t StringValueScanner::FindNextNewLine() const {
+	idx_t cur_pos = iterator.pos.buffer_pos; // local cursor
+	// Locate the position right after the next newline; only cur_pos moves, the iterator is left untouched
+	if (state_machine->options.dialect_options.state_machine_options.new_line.GetValue() ==
+	    NewLineIdentifier::CARRY_ON) {
+		bool carriage_return = false;
+		bool not_carriage_return = false;
+		for (; cur_pos < cur_buffer_handle->actual_size; cur_pos++) {
+			if (buffer_handle_ptr[cur_pos] == '\r') {
+				carriage_return = true;
+			} else if (buffer_handle_ptr[cur_pos] != '\n') {
+				not_carriage_return = true;
+			}
+			if (buffer_handle_ptr[cur_pos] == '\n') {
+				if (carriage_return || not_carriage_return) { // a '\n' only counts after some non-'\n' byte
+					cur_pos++;
+					return cur_pos; // position right after the '\n'
+				}
+			}
+		}
+	} else {
+		for (; cur_pos < cur_buffer_handle->actual_size; cur_pos++) {
+			if (buffer_handle_ptr[cur_pos] == '\n' || buffer_handle_ptr[cur_pos] == '\r') {
+				cur_pos++;
+				return cur_pos; // position right after the first '\n' or '\r'
+			}
+		}
+	}
+	return cur_pos; // no newline found before the end of the buffer
+}
+
 void StringValueScanner::SetStart() {
 	if (iterator.first_one) {
 		if (result.store_line_size) {
@@ -1472,50 +1514,41 @@ void StringValueScanner::SetStart() {
 	}
 	// The result size of the data after skipping the row is one line
 	// We have to look for a new line that fits our schema
-	// 1. We walk until the next new line
-	bool line_found;
-	unique_ptr<StringValueScanner> scan_finder;
-	do {
-		constexpr idx_t result_size = 1;
-		SkipUntilNewLine();
-		if (state_machine->options.null_padding) {
-			// When Null Padding, we assume we start from the correct new-line
-			return;
-		}
-		scan_finder =
-		    make_uniq<StringValueScanner>(0U, buffer_manager, state_machine, make_shared_ptr<CSVErrorHandler>(true),
-		                                  csv_file_scan, false, iterator, result_size);
-		auto &tuples = scan_finder->ParseChunk();
-		line_found = true;
-		if (tuples.number_of_rows != 1 ||
-		    (!tuples.borked_rows.empty() && !state_machine->options.ignore_errors.GetValue()) ||
-		    tuples.first_line_is_comment) {
-			line_found = false;
-			// If no tuples were parsed, this is not the correct start, we need to skip until the next new line
-			// Or if columns don't match, this is not the correct start, we need to skip until the next new line
-			if (scan_finder->previous_buffer_handle) {
-				if (scan_finder->iterator.pos.buffer_pos >= scan_finder->previous_buffer_handle->actual_size &&
-				    scan_finder->previous_buffer_handle->is_last_buffer) {
-					iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
-					iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
-					result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
-					iterator.done = scan_finder->iterator.done;
-					return;
-				}
-			}
-			if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size ||
-			    scan_finder->iterator.GetBufferIdx() > iterator.GetBufferIdx()) {
-				// If things go terribly wrong, we never loop indefinitely.
-				iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
-				iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
-				result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
-				iterator.done = scan_finder->iterator.done;
-				return;
-			}
+	idx_t next_new_line = FindNextNewLine();
+	idx_t potential_start = cur_buffer_handle->actual_size;
+	idx_t largest_end_pos = 0;
+	bool any_valid_row = false;
+	if (state_machine->options.null_padding) {
+		// When Null Padding, we assume we start from the correct new-line
+		return;
+	}
+	// At this point we have 3 options:
+	// 1. We are at the start of a valid line
+	TryRow(CSVState::STANDARD_NEWLINE, potential_start, largest_end_pos, any_valid_row);
+	// 2. We are in the middle of a quoted value
+	if (potential_start > next_new_line &&
+	    state_machine->dialect_options.state_machine_options.quote.GetValue() != '\0') {
+		TryRow(CSVState::QUOTED, potential_start, largest_end_pos, any_valid_row);
+	}
+	// 3. We are in an escaped value
+	if (!any_valid_row && potential_start > next_new_line &&
+	    state_machine->dialect_options.state_machine_options.escape.GetValue() != '\0') {
+		TryRow(CSVState::ESCAPE, potential_start, largest_end_pos, any_valid_row);
+	}
+	if (!any_valid_row) {
+		bool is_this_the_end = largest_end_pos == cur_buffer_handle->actual_size && cur_buffer_handle->is_last_buffer;
+		if (is_this_the_end) {
+			iterator.pos.buffer_pos = largest_end_pos;
+			iterator.done = true;
+		} else {
+			SkipUntilState(CSVState::STANDARD_NEWLINE, CSVState::RECORD_SEPARATOR);
 		}
-	} while (!line_found);
-	iterator.pos.buffer_idx = scan_finder->result.current_line_position.begin.buffer_idx;
-	iterator.pos.buffer_pos = scan_finder->result.current_line_position.begin.buffer_pos;
+	} else {
+		iterator.pos.buffer_pos = potential_start;
+		iterator.done = iterator.pos.buffer_pos == cur_buffer_handle->actual_size;
+	}
+	// 4. We have an error. If so, we let life go on: the scanner will either ignore it
+	// or throw.
 	result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 }
 

diff --git a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp
@@ -50,14 +50,20 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 
 	// Now set values depending on configuration
 	// 1) Standard/Invalid State
-	vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID)};
+	vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID),
+	                         static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)};
 	for (auto &state : std_inv) {
 		transition_array[delimiter][state] = CSVState::DELIMITER;
-		transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
 		if (new_line_id == NewLineIdentifier::CARRY_ON) {
 			transition_array[static_cast<uint8_t>('\r')][state] = CSVState::CARRIAGE_RETURN;
+			if (state == static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)) {
+				transition_array[static_cast<uint8_t>('\n')][state] = CSVState::STANDARD;
+			} else {
+				transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
+			}
 		} else {
 			transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR;
+			transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
 		}
 		if (comment != '\0') {
 			transition_array[comment][state] = CSVState::COMMENT;

diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_state.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_state.hpp
@@ -14,18 +14,19 @@ namespace duckdb {
 
 //! All States of CSV Parsing
 enum class CSVState : uint8_t {
-	STANDARD = 0,         //! Regular unquoted field state
-	DELIMITER = 1,        //! State after encountering a field separator (e.g., ;)
-	RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
-	CARRIAGE_RETURN = 3,  //! State after encountering a carriage return(i.e., \r)
-	QUOTED = 4,           //! State when inside a quoted field
-	UNQUOTED = 5,         //! State when leaving a quoted field
-	ESCAPE = 6,           //! State when encountering an escape character (e.g., \)
-	INVALID = 7,          //! Got to an Invalid State, this should error.
-	NOT_SET = 8,          //! If the state is not set, usually the first state before getting the first character
-	QUOTED_NEW_LINE = 9,  //! If we have a quoted newline
-	EMPTY_SPACE = 10,     //! If we have empty spaces in the beginning and end of value
-	COMMENT = 11          //! If we are in a comment state, and hence have to skip the whole line
+	STANDARD = 0,          //! Regular unquoted field state
+	DELIMITER = 1,         //! State after encountering a field separator (e.g., ;)
+	RECORD_SEPARATOR = 2,  //! State after encountering a record separator (i.e., \n)
+	CARRIAGE_RETURN = 3,   //! State after encountering a carriage return(i.e., \r)
+	QUOTED = 4,            //! State when inside a quoted field
+	UNQUOTED = 5,          //! State when leaving a quoted field
+	ESCAPE = 6,            //! State when encountering an escape character (e.g., \)
+	INVALID = 7,           //! Got to an Invalid State, this should error.
+	NOT_SET = 8,           //! If the state is not set, usually the first state before getting the first character
+	QUOTED_NEW_LINE = 9,   //! If we have a quoted newline
+	EMPTY_SPACE = 10,      //! If we have empty spaces in the beginning and end of value
+	COMMENT = 11,          //! If we are in a comment state, and hence have to skip the whole line
+	STANDARD_NEWLINE = 12, //! Start state assumed when probing for the beginning of a row
 };
 
 } // namespace duckdb
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp
@@ -17,65 +17,68 @@ namespace duckdb {
 //! State of necessary CSV States to parse file
 //! Current, previous, and state before the previous
 struct CSVStates {
-	void Initialize() {
-		states[0] = CSVState::NOT_SET;
-		states[1] = CSVState::NOT_SET;
+	void Initialize(CSVState initial_state = CSVState::NOT_SET) { // sets both slots to initial_state
+		states[0] = initial_state;
+		states[1] = initial_state;
 	}
-	inline bool NewValue() {
+	inline bool NewValue() const {
 		return states[1] == CSVState::DELIMITER;
 	}
 
-	inline bool NewRow() {
+	inline bool NewRow() const {
 		// It is a new row, if the previous state is not a record separator, and the current one is
 		return states[0] != CSVState::RECORD_SEPARATOR && states[0] != CSVState::CARRIAGE_RETURN &&
 		       (states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN);
 	}
 
-	inline bool WasStandard() {
+	inline bool WasStandard() const {
 		return states[0] == CSVState::STANDARD;
 	}
 
-	inline bool EmptyLastValue() {
+	inline bool EmptyLastValue() const {
 		// It is a new row, if the previous state is not a record separator, and the current one is
 		return states[0] == CSVState::DELIMITER &&
 		       (states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN ||
 		        states[1] == CSVState::DELIMITER);
 	}
 
-	inline bool EmptyLine() {
+	inline bool EmptyLine() const {
 		return (states[1] == CSVState::CARRIAGE_RETURN || states[1] == CSVState::RECORD_SEPARATOR) &&
 		       (states[0] == CSVState::RECORD_SEPARATOR || states[0] == CSVState::NOT_SET);
 	}
 
-	inline bool IsNotSet() {
+	inline bool IsNotSet() const {
 		return states[1] == CSVState::NOT_SET;
 	}
 
-	inline bool IsComment() {
+	inline bool IsComment() const {
 		return states[1] == CSVState::COMMENT;
 	}
 
-	inline bool IsCurrentNewRow() {
+	inline bool IsCurrentNewRow() const {
 		return states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN;
 	}
 
-	inline bool IsCarriageReturn() {
+	inline bool IsCarriageReturn() const {
 		return states[1] == CSVState::CARRIAGE_RETURN;
 	}
 
-	inline bool IsInvalid() {
+	inline bool IsInvalid() const {
 		return states[1] == CSVState::INVALID;
 	}
 
-	inline bool IsQuoted() {
+	inline bool IsQuoted() const {
 		return states[0] == CSVState::QUOTED;
 	}
-	inline bool IsEscaped() {
+	inline bool IsEscaped() const {
 		return states[1] == CSVState::ESCAPE || (states[0] == CSVState::UNQUOTED && states[1] == CSVState::QUOTED);
 	}
-	inline bool IsQuotedCurrent() {
+	inline bool IsQuotedCurrent() const {
 		return states[1] == CSVState::QUOTED || states[1] == CSVState::QUOTED_NEW_LINE;
 	}
+	inline bool IsState(const CSVState state) const { // compares against states[1] only
+		return states[1] == state;
+	}
 	CSVState states[2];
 };
 
@@ -98,7 +101,7 @@ class CSVStateMachine {
 		states.states[1] = transition_array[static_cast<uint8_t>(current_char)][static_cast<uint8_t>(states.states[1])];
 	}
 
-	void Print() {
+	void Print() const {
 		std::cout << "State Machine Options" << '\n';
 		std::cout << "Delim: " << state_machine_options.delimiter.GetValue() << '\n';
 		std::cout << "Quote: " << state_machine_options.quote.GetValue() << '\n';
-Original file line number
+Diff line change
@@ -0,0 +1,23 @@
+    a,b,c
+,1,1
+,1,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,\n,1
+,1,1
+,1,1