Bug report
There is a private function _splitlines_no_ff which is only ever called in ast.get_source_segment. This functions splits the entire source given to it, but ast.get_source_segment only needs at most node.end_lineo lines to work.
|
def _splitlines_no_ff(source): |
|
"""Split a string into lines ignoring form feed and other chars. |
|
|
|
This mimics how the Python parser splits source code. |
|
""" |
|
idx = 0 |
|
lines = [] |
|
next_line = '' |
|
while idx < len(source): |
|
c = source[idx] |
|
next_line += c |
|
idx += 1 |
|
# Keep \r\n together |
|
if c == '\r' and idx < len(source) and source[idx] == '\n': |
|
next_line += '\n' |
|
idx += 1 |
|
if c in '\r\n': |
|
lines.append(next_line) |
|
next_line = '' |
|
|
|
if next_line: |
|
lines.append(next_line) |
|
return lines |
|
def get_source_segment(source, node, *, padded=False): |
|
"""Get source code segment of the *source* that generated *node*. |
|
|
|
If some location information (`lineno`, `end_lineno`, `col_offset`, |
|
or `end_col_offset`) is missing, return None. |
|
|
|
If *padded* is `True`, the first line of a multi-line statement will |
|
be padded with spaces to match its original position. |
|
""" |
|
try: |
|
if node.end_lineno is None or node.end_col_offset is None: |
|
return None |
|
lineno = node.lineno - 1 |
|
end_lineno = node.end_lineno - 1 |
|
col_offset = node.col_offset |
|
end_col_offset = node.end_col_offset |
|
except AttributeError: |
|
return None |
|
|
|
lines = _splitlines_no_ff(source) |
|
if end_lineno == lineno: |
|
return lines[lineno].encode()[col_offset:end_col_offset].decode() |
|
|
|
if padded: |
|
padding = _pad_whitespace(lines[lineno].encode()[:col_offset].decode()) |
|
else: |
|
padding = '' |
|
|
|
first = padding + lines[lineno].encode()[col_offset:].decode() |
|
last = lines[end_lineno].encode()[:end_col_offset].decode() |
|
lines = lines[lineno+1:end_lineno] |
|
|
|
lines.insert(0, first) |
|
lines.append(last) |
|
return ''.join(lines) |
If, for example, you want to extract an import line from a very long file, this can seriously degrade performance.
The introduction of a max_lines kwarg in _splitlines_no_ff which functions like maxsplit in str.split would minimize unneeded work. An implementation of the proposed fix is below (which makes my use case twice as fast):
--- a/Lib/ast.py
+++ b/Lib/ast.py
@@ -305,11 +305,16 @@ def get_docstring(node, clean=True):
return text
-def _splitlines_no_ff(source):
+def _splitlines_no_ff(source, max_lines=-1):
"""Split a string into lines ignoring form feed and other chars.
This mimics how the Python parser splits source code.
+
+ If max_lines is given, at most max_lines will be returned. If max_lines is not
+ specified or negative, then there is no limit on the number of lines returned.
"""
+ if not max_lines:
+ return []
idx = 0
lines = []
next_line = ''
@@ -323,6 +328,8 @@ def _splitlines_no_ff(source):
idx += 1
if c in '\r\n':
lines.append(next_line)
+ if max_lines == len(lines):
+ return lines
next_line = ''
if next_line:
@@ -360,7 +367,7 @@ def get_source_segment(source, node, *, padded=False):
except AttributeError:
return None
- lines = _splitlines_no_ff(source)
+ lines = _splitlines_no_ff(source, max_lines=end_lineno + 1)
if end_lineno == lineno:
return lines[lineno].encode()[col_offset:end_col_offset].decode()
Your environment
- CPython versions tested on: 3.11
Linked PRs
Bug report
There is a private function
_splitlines_no_ffwhich is only ever called inast.get_source_segment. This functions splits the entire source given to it, butast.get_source_segmentonly needs at mostnode.end_lineolines to work.cpython/Lib/ast.py
Lines 308 to 330 in 1acdfec
cpython/Lib/ast.py
Lines 344 to 378 in 1acdfec
If, for example, you want to extract an import line from a very long file, this can seriously degrade performance.
The introduction of a
max_lineskwarg in_splitlines_no_ffwhich functions likemaxsplitinstr.splitwould minimize unneeded work. An implementation of the proposed fix is below (which makes my use case twice as fast):Your environment
Linked PRs