|
| 1 | +import sys |
| 2 | +from typing import BinaryIO, Callable, Iterable, cast |
| 3 | + |
| 4 | +from .. import core |
| 5 | + |
| 6 | + |
# A parsed LIST argument: each entry is either a single position or a
# (start, end) tuple, where an end of None means the range is open-ended.
type Ranges = list[int | tuple[int, int | None]]
# Predicate that reports whether a given position is selected.
type RangeChecker = Callable[[int], bool]

# Transforms one input record; a result of None means nothing is printed for it.
type Cutter = Callable[[bytes], bytes | None]
| 11 | + |
| 12 | + |
| 13 | +def get_check_range(ranges: Ranges, complement: bool) -> RangeChecker: |
| 14 | + def check_range(pos: int) -> bool: |
| 15 | + for r in ranges: |
| 16 | + match r: |
| 17 | + case [min_pos, None]: |
| 18 | + if pos >= min_pos: |
| 19 | + return True |
| 20 | + case [min_pos, max_pos]: |
| 21 | + if min_pos <= pos <= max_pos: |
| 22 | + return True |
| 23 | + case wanted_pos: |
| 24 | + if pos == wanted_pos: |
| 25 | + return True |
| 26 | + |
| 27 | + return False |
| 28 | + |
| 29 | + return (lambda pos: not check_range(pos)) if complement else check_range |
| 30 | + |
| 31 | + |
def get_cut_by_bytes(check_range: RangeChecker, line_terminator: bytes) -> Cutter:
    """Build a cutter that keeps only the selected bytes of each record.

    ``check_range`` is called with the 1-based position of every byte in the
    record. The returned cutter always ends its output with exactly one
    ``line_terminator``.
    """

    def cut_by_bytes(data: bytes) -> bytes:
        # Records may arrive with their terminator still attached. It is not
        # part of the record's content, so drop it before selecting bytes;
        # otherwise a range covering it would emit the terminator twice.
        record = data.removesuffix(line_terminator)

        selected = bytes(
            byte
            for position, byte in enumerate(record, start=1)
            if check_range(position)
        )

        return selected + line_terminator

    return cut_by_bytes
| 40 | + |
| 41 | + |
def get_cut_by_fields(
    check_range: RangeChecker,
    input_delimiter: bytes,
    output_delimiter: bytes,
    only_delimited: bool,
) -> Cutter:
    """Build a cutter that keeps only the selected delimiter-separated fields.

    The record is split on ``input_delimiter``; ``check_range`` is called with
    the 1-based number of every field and the kept fields are re-joined with
    ``output_delimiter``. A record without any delimiter is returned unchanged,
    or suppressed (``None``) when ``only_delimited`` is true.

    NOTE(review): no line terminator is stripped or added here, so a
    terminator present in the record stays attached to its last field.
    """

    def cut_by_fields(data: bytes) -> bytes | None:
        pieces = data.split(input_delimiter)

        if len(pieces) == 1:
            # The delimiter does not occur in this record.
            if only_delimited:
                return None
            return data

        kept = []
        for number, piece in enumerate(pieces, start=1):
            if check_range(number):
                kept.append(piece)

        return output_delimiter.join(kept)

    return cut_by_fields
| 59 | + |
| 60 | + |
def cut_and_print_stream(stream: Iterable[bytes], cutter: Cutter) -> None:
    """Run ``cutter`` over every record of ``stream`` and write the results.

    Records for which ``cutter`` returns ``None`` produce no output. Output
    goes to the binary layer of standard output and is flushed after each
    written record.
    """
    out = sys.stdout.buffer

    for record in stream:
        result = cutter(record)
        if result is None:
            continue

        out.write(result)
        out.flush()
| 66 | + |
| 67 | + |
# Command-line interface for cut. Exactly one of -b/-c/-f is expected; that
# is enforced by the command function, not by the parser itself.
parser = core.ExtendedOptionParser(
    usage="%prog OPTION... [FILE]...",
    description="Print selected parts of lines from each FILE to standard output.",
)

# Selection modes; each takes a comma-separated LIST of positions and ranges.
parser.add_option("-b", "--bytes", metavar="LIST", help="select bytes in LIST")
parser.add_option("-c", "--characters", metavar="LIST", help="identical to -b")
parser.add_option("-f", "--fields", metavar="LIST", help="select fields in LIST")

parser.add_option("--complement", action="store_true", help="invert selection")

# Field-mode options.
parser.add_option(
    "-s",
    "--only-delimited",
    action="store_true",
    help="ignore lines not containing the delimiter",
)
parser.add_option(
    "-d",
    "--delimiter",
    metavar="STRING",
    help="use STRING instead of TAB as field delimiter",
)
parser.add_option(
    "--output-delimiter",
    metavar="STRING",
    help="use STRING instead of input delimiter as output delimiter",
)

parser.add_option(
    "-z",
    "--zero-terminated",
    action="store_true",
    help="line delimiter is NUL instead of newline",
)

parser.add_option(
    "-n", action="store_true", help="(ignored; present for POSIX compatibility)"
)
| 104 | + |
| 105 | + |
| 106 | +def parse_range(range_specs: str) -> Ranges: |
| 107 | + ranges: Ranges = [] |
| 108 | + |
| 109 | + for range_spec in range_specs.split(","): |
| 110 | + parts = range_spec.split("-") |
| 111 | + |
| 112 | + try: |
| 113 | + match parts: |
| 114 | + case [n]: |
| 115 | + ranges.append(int(n)) |
| 116 | + case [n, ""]: |
| 117 | + ranges.append((int(n), None)) |
| 118 | + case ["", m]: |
| 119 | + ranges.append((0, int(m))) |
| 120 | + case [n, m]: |
| 121 | + ranges.append((int(n), int(m))) |
| 122 | + case _: |
| 123 | + raise ValueError |
| 124 | + except ValueError: |
| 125 | + parser.error(f"invalid range specification: {range_specs}") |
| 126 | + |
| 127 | + return ranges |
| 128 | + |
| 129 | + |
| 130 | +@core.command(parser) |
| 131 | +def python_userland_cut(opts, args: list[str]) -> int: |
| 132 | + cutter: Cutter |
| 133 | + |
| 134 | + match (opts.bytes, opts.characters, opts.fields): |
| 135 | + case (None, None, None): |
| 136 | + parser.error("expected one of --bytes, --characters or --fields") |
| 137 | + case (byte_range_spec, None, None) | (None, byte_range_spec, None): |
| 138 | + if opts.delimiter: |
| 139 | + parser.error("--delimiter is only allowed with --fields") |
| 140 | + |
| 141 | + if opts.only_delimited: |
| 142 | + parser.error("--only-delimited is only allowed with --fields") |
| 143 | + |
| 144 | + cutter = get_cut_by_bytes( |
| 145 | + check_range=get_check_range( |
| 146 | + parse_range(cast(str, byte_range_spec)), opts.complement |
| 147 | + ), |
| 148 | + line_terminator=b"\0" if opts.zero_terminated else b"\n", |
| 149 | + ) |
| 150 | + case (None, None, field_range_spec): |
| 151 | + opts.delimiter = opts.delimiter or "\t" |
| 152 | + |
| 153 | + if len(opts.delimiter) > 1: |
| 154 | + parser.error("the delimiter must be a single character") |
| 155 | + |
| 156 | + cutter = get_cut_by_fields( |
| 157 | + check_range=get_check_range( |
| 158 | + parse_range(field_range_spec), opts.complement |
| 159 | + ), |
| 160 | + input_delimiter=(input_delimiter := opts.delimiter.encode()), |
| 161 | + output_delimiter=( |
| 162 | + opts.output_delimiter.encode() |
| 163 | + if opts.output_delimiter is not None |
| 164 | + else input_delimiter |
| 165 | + ), |
| 166 | + only_delimited=opts.only_delimited, |
| 167 | + ) |
| 168 | + case _: |
| 169 | + parser.error("only one list may be specified") |
| 170 | + |
| 171 | + append_newline = False |
| 172 | + |
| 173 | + # This is a hack to handle "\n" as a field delimiter. |
| 174 | + def process_line_stream(stream: BinaryIO) -> Iterable[bytes]: |
| 175 | + nonlocal append_newline |
| 176 | + |
| 177 | + if not (opts.fields and opts.delimiter == "\n"): |
| 178 | + return stream |
| 179 | + |
| 180 | + data = stream.read() |
| 181 | + if data and data[-1] == ord(b"\n"): |
| 182 | + # Don't treat the last newline as a delimiter. |
| 183 | + data = data[:-1] |
| 184 | + append_newline = True |
| 185 | + |
| 186 | + return (data for _ in (None,)) |
| 187 | + |
| 188 | + failed = False |
| 189 | + |
| 190 | + for name in args or ["-"]: |
| 191 | + append_newline = False |
| 192 | + |
| 193 | + if name == "-": |
| 194 | + cut_and_print_stream( |
| 195 | + ( |
| 196 | + core.get_lines_by_delimiter(sys.stdin.buffer, b"\0") |
| 197 | + if opts.zero_terminated |
| 198 | + else process_line_stream(sys.stdin.buffer) |
| 199 | + ), |
| 200 | + cutter, |
| 201 | + ) |
| 202 | + else: |
| 203 | + try: |
| 204 | + with open(name, "rb") as f: |
| 205 | + cut_and_print_stream( |
| 206 | + ( |
| 207 | + core.get_lines_by_delimiter(f, b"\0") |
| 208 | + if opts.zero_terminated |
| 209 | + else process_line_stream(f) |
| 210 | + ), |
| 211 | + cutter, |
| 212 | + ) |
| 213 | + except OSError as e: |
| 214 | + failed = True |
| 215 | + core.perror(e) |
| 216 | + continue |
| 217 | + |
| 218 | + if append_newline: |
| 219 | + print() |
| 220 | + |
| 221 | + return int(failed) |
0 commit comments