@@ -31,6 +31,11 @@ typedef struct
31
31
{
32
32
PyObject_HEAD struct tok_state * tok ;
33
33
int done ;
34
+
35
+ /* Needed to cache line for performance */
36
+ PyObject * last_line ;
37
+ Py_ssize_t last_lineno ;
38
+ Py_ssize_t byte_col_offset_diff ;
34
39
} tokenizeriterobject ;
35
40
36
41
/*[clinic input]
@@ -67,6 +72,11 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
67
72
self -> tok -> tok_extra_tokens = 1 ;
68
73
}
69
74
self -> done = 0 ;
75
+
76
+ self -> last_line = NULL ;
77
+ self -> byte_col_offset_diff = 0 ;
78
+ self -> last_lineno = 0 ;
79
+
70
80
return (PyObject * )self ;
71
81
}
72
82
@@ -209,7 +219,18 @@ tokenizeriter_next(tokenizeriterobject *it)
209
219
if (size >= 1 && it -> tok -> implicit_newline ) {
210
220
size -= 1 ;
211
221
}
212
- line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
222
+
223
+ if (it -> tok -> lineno != it -> last_lineno ) {
224
+ // Line has changed since last token, so we fetch the new line and cache it
225
+ // in the iter object.
226
+ Py_XDECREF (it -> last_line );
227
+ line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
228
+ it -> last_line = line ;
229
+ it -> byte_col_offset_diff = 0 ;
230
+ } else {
231
+ // Line hasn't changed so we reuse the cached one.
232
+ line = it -> last_line ;
233
+ }
213
234
}
214
235
if (line == NULL ) {
215
236
Py_DECREF (str );
@@ -218,13 +239,28 @@ tokenizeriter_next(tokenizeriterobject *it)
218
239
219
240
Py_ssize_t lineno = ISSTRINGLIT (type ) ? it -> tok -> first_lineno : it -> tok -> lineno ;
220
241
Py_ssize_t end_lineno = it -> tok -> lineno ;
242
+ it -> last_lineno = lineno ;
243
+
221
244
Py_ssize_t col_offset = -1 ;
222
245
Py_ssize_t end_col_offset = -1 ;
246
+ Py_ssize_t byte_offset = -1 ;
223
247
if (token .start != NULL && token .start >= line_start ) {
224
- col_offset = _PyPegen_byte_offset_to_character_offset (line , token .start - line_start );
248
+ byte_offset = token .start - line_start ;
249
+ col_offset = byte_offset - it -> byte_col_offset_diff ;
225
250
}
226
251
if (token .end != NULL && token .end >= it -> tok -> line_start ) {
227
- end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , token .end - it -> tok -> line_start );
252
+ Py_ssize_t end_byte_offset = token .end - it -> tok -> line_start ;
253
+ if (lineno == end_lineno ) {
254
+ // If the whole token is at the same line, we can just use the token.start
255
+ // buffer for figuring out the new column offset, since using line is not
256
+ // performant for very long lines.
257
+ Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line (line , byte_offset , end_byte_offset );
258
+ end_col_offset = col_offset + token_col_offset ;
259
+ it -> byte_col_offset_diff += token .end - token .start - token_col_offset ;
260
+ } else {
261
+ end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , end_byte_offset );
262
+ it -> byte_col_offset_diff += end_byte_offset - end_col_offset ;
263
+ }
228
264
}
229
265
230
266
if (it -> tok -> tok_extra_tokens ) {
@@ -264,7 +300,7 @@ tokenizeriter_next(tokenizeriterobject *it)
264
300
}
265
301
}
266
302
267
- result = Py_BuildValue ("(iN(nn)(nn)N )" , type , str , lineno , col_offset , end_lineno , end_col_offset , line );
303
+ result = Py_BuildValue ("(iN(nn)(nn)O )" , type , str , lineno , col_offset , end_lineno , end_col_offset , line );
268
304
exit :
269
305
_PyToken_Free (& token );
270
306
if (type == ENDMARKER ) {
0 commit comments