Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit fe875c0

Browse files
Don't double-decode input, causing non-UTF-8 files to be corrupted (psf#4964)
Co-authored-by: MeGaGiGaGon <[email protected]>
1 parent 5cdb4b6 commit fe875c0

3 files changed

Lines changed: 47 additions & 5 deletions

File tree

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
<!-- Changes that affect Black's stable style -->
1515

16+
- Don't double-decode input, causing non-UTF-8 files to be corrupted (#4964)
17+
1618
### Preview style
1719

1820
<!-- Changes that affect Black's preview style -->

src/black/__init__.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1218,8 +1218,10 @@ def f(
12181218
def _format_str_once(
12191219
src_contents: str, *, mode: Mode, lines: Collection[tuple[int, int]] = ()
12201220
) -> str:
1221+
# Use the encoding overwrite since the src_contents may contain a different
1222+
# magic encoding comment than utf-8
12211223
normalized_contents, _, newline_type = decode_bytes(
1222-
src_contents.encode("utf-8"), mode
1224+
src_contents.encode("utf-8"), mode, encoding_overwrite="utf-8"
12231225
)
12241226

12251227
src_node = lib2to3_parse(
@@ -1276,14 +1278,25 @@ def _format_str_once(
12761278
return "".join(dst_contents).replace("\n", newline_type)
12771279

12781280

1279-
def decode_bytes(src: bytes, mode: Mode) -> tuple[FileContent, Encoding, NewLine]:
1281+
def decode_bytes(
1282+
src: bytes, mode: Mode, *, encoding_overwrite: str | None = None
1283+
) -> tuple[FileContent, Encoding, NewLine]:
12801284
"""Return a tuple of (decoded_contents, encoding, newline).
12811285
1282-
`newline` is either CRLF or LF but `decoded_contents` is decoded with
1286+
`newline` is either CRLF, LF, or CR, but `decoded_contents` is decoded with
12831287
universal newlines (i.e. only contains LF).
1288+
1289+
Use the keyword only encoding_overwrite argument if the bytes are encoded
1290+
differently to their possible encoding magic comment.
12841291
"""
12851292
srcbuf = io.BytesIO(src)
1293+
1294+
# Still use detect encoding even if overwrite set because otherwise lines
1295+
# might be different
12861296
encoding, lines = tokenize.detect_encoding(srcbuf.readline)
1297+
if encoding_overwrite is not None:
1298+
encoding = encoding_overwrite
1299+
12871300
if not lines:
12881301
return "", encoding, "\n"
12891302

tests/test_black.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from io import BytesIO
2020
from pathlib import Path, WindowsPath
2121
from platform import system
22-
from tempfile import TemporaryDirectory
22+
from tempfile import NamedTemporaryFile, TemporaryDirectory
2323
from typing import Any, TypeVar
2424
from unittest.mock import MagicMock, patch
2525

@@ -2080,12 +2080,39 @@ def test_carriage_return_edge_cases(self) -> None:
20802080
== "class A: ...\r"
20812081
)
20822082

2083-
def test_preview_newline_type_detection(self) -> None:
2083+
def test_newline_type_detection(self) -> None:
20842084
mode = Mode()
20852085
newline_types = ["A\n", "A\r\n", "A\r"]
20862086
for test_case in itertools.permutations(newline_types):
20872087
assert black.format_str("".join(test_case), mode=mode) == test_case[0] * 3
20882088

2089+
def test_decode_with_encoding(self) -> None:
2090+
# This uses temporary files since some editors (including GitHub)
2091+
# struggle with displaying and/or editing non utf-8 data
2092+
# \xfc is iso-8859-1 for ü
2093+
with NamedTemporaryFile(delete=False) as first_line:
2094+
first_line.write(
2095+
b"# -*- coding: iso-8859-1 -*-\n"
2096+
b"# 2002-11-22 J\xfcrgen Hermann <[email protected]>\n"
2097+
)
2098+
first_line.close()
2099+
self.assertFalse(
2100+
ff(Path(first_line.name)),
2101+
"Failed to properly detect encoding",
2102+
)
2103+
2104+
with NamedTemporaryFile(delete=False) as second_line:
2105+
second_line.write(
2106+
b"#! /usr/bin/env python3\n"
2107+
b"# -*- coding: iso-8859-1 -*-\n"
2108+
b"# 2002-11-22 J\xfcrgen Hermann <[email protected]>\n"
2109+
)
2110+
second_line.close()
2111+
self.assertFalse(
2112+
ff(Path(second_line.name)),
2113+
"Failed to properly detect encoding on second line",
2114+
)
2115+
20892116

20902117
class TestCaching:
20912118
def test_get_cache_dir(

0 commit comments

Comments
 (0)