From 516cdacdc9407f17ab0e471b40df1b198adca02a Mon Sep 17 00:00:00 2001 From: David Riazati Date: Tue, 15 Jun 2021 15:33:01 -0700 Subject: [PATCH 1/4] Use GitHub's diff directly in clang-tidy This changes clang-tidy in lint.yml to pull the raw diff from GitHub and parse that rather than use the PRs base revision. The base revision can cause the spurious inclusion of files not changed in the PR as in https://github.com/pytorch/pytorch/pull/59967/checks?check_run_id=2832565901. We could be smarter about how we query git, but this approach ends up being simpler since we just need to search for the diff headers in the .diff file. [ghstack-poisoned] --- .github/workflows/lint.yml | 47 +++++++++++++++++---------------- tools/clang_tidy.py | 53 +++++++++++++++----------------------- 2 files changed, 46 insertions(+), 54 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 550f8abb98da8..dfd71873c534c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -312,12 +312,14 @@ jobs: fi - name: Run clang-tidy env: - BASE_SHA: ${{ github.event.pull_request.base.sha }} HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} run: | cd "${GITHUB_WORKSPACE}" set -eux + wget -O pr.diff "https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/$PR_NUMBER.diff" + # Run Clang-Tidy # The negative filters below are to exclude files that include onnx_pb.h or # caffe2_pb.h, otherwise we'd have to build protos as part of this CI job. @@ -326,27 +328,28 @@ jobs: # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
# deploy/interpreter files are excluded due to using macros and other techniquies # that are not easily converted to accepted c++ - python3 tools/clang_tidy.py \ - --verbose \ - --paths torch/csrc/ \ - --diff "$BASE_SHA" \ - -g"-torch/csrc/jit/passes/onnx/helper.cpp" \ - -g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp"\ - -g"-torch/csrc/jit/serialization/onnx.cpp" \ - -g"-torch/csrc/jit/serialization/export.cpp" \ - -g"-torch/csrc/jit/serialization/import.cpp" \ - -g"-torch/csrc/jit/serialization/import_legacy.cpp" \ - -g"-torch/csrc/onnx/init.cpp" \ - -g"-torch/csrc/cuda/nccl.*" \ - -g"-torch/csrc/cuda/python_nccl.cpp" \ - -g"-torch/csrc/autograd/FunctionsManual.cpp" \ - -g"-torch/csrc/generic/*.cpp" \ - -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ - -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ - -g"-torch/csrc/deploy/interpreter/interpreter.h" \ - -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ - -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ - "$@" > "${GITHUB_WORKSPACE}"/clang-tidy-output.txt + python3 tools/clang_tidy.py \ + --verbose \ + --paths torch/csrc/ \ + --diff-file pr.diff \ + -g"-torch/csrc/jit/passes/onnx/helper.cpp" \ + -g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp" \ + -g"-torch/csrc/jit/serialization/onnx.cpp" \ + -g"-torch/csrc/jit/serialization/export.cpp" \ + -g"-torch/csrc/jit/serialization/import.cpp" \ + -g"-torch/csrc/jit/serialization/import_legacy.cpp" \ + -g"-torch/csrc/onnx/init.cpp" \ + -g"-torch/csrc/cuda/nccl.*" \ + -g"-torch/csrc/cuda/python_nccl.cpp" \ + -g"-torch/csrc/autograd/FunctionsManual.cpp" \ + -g"-torch/csrc/generic/*.cpp" \ + -g"-torch/csrc/jit/codegen/cuda/runtime/*" \ + -g"-torch/csrc/deploy/interpreter/interpreter.cpp" \ + -g"-torch/csrc/deploy/interpreter/interpreter.h" \ + -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \ + -g"-torch/csrc/deploy/interpreter/test_main.cpp" \ + "$@" >"${GITHUB_WORKSPACE}"/clang-tidy-output.txt + cat 
"${GITHUB_WORKSPACE}"/clang-tidy-output.txt diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index f5c71f41cd3d2..a306e9047c745 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -21,7 +21,6 @@ import os import os.path import re -import shlex import shutil import subprocess import sys @@ -32,7 +31,7 @@ except ImportError: from pipes import quote -from typing import Any, Dict, Iterable, List, Set, Union +from typing import Any, Dict, Iterable, List, Set, IO, Tuple Patterns = collections.namedtuple("Patterns", "positive, negative") @@ -44,6 +43,7 @@ # @@ -start,count +start,count @@ CHUNK_PATTERN = r"^@@\s+-\d+(?:,\d+)?\s+\+(\d+)(?:,(\d+))?\s+@@" +CHUNK_HEADER_RE = r"diff --git .*?\nindex.*?\n---.*?\n\+\+\+ b/(.*?)\n@@ -(\d+,\d+) \+(\d+,\d+) @@" CLANG_WARNING_PATTERN = re.compile(r"([^:]+):(\d+):\d+:\s+warning:.*\[([^\]]+)\]") @@ -125,35 +125,26 @@ def filter_files(files: Iterable[str], file_patterns: Patterns) -> Iterable[str] print("{} omitted due to file filters".format(file)) -def get_changed_files(revision: str, paths: List[str]) -> List[str]: - """Runs git diff to get the paths of all changed files.""" - # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). - # --name-only makes git diff return only the file paths, without any of the source changes. 
- command = "git diff-index --diff-filter=AMU --ignore-all-space --name-only" - output = run_shell_command(shlex.split(command) + [revision] + paths) - return output.split("\n") - - def get_all_files(paths: List[str]) -> List[str]: """Returns all files that are tracked by git in the given paths.""" output = run_shell_command(["git", "ls-files"] + paths) return output.split("\n") -def get_changed_lines(revision: str, filename: str) -> Dict[str, Union[str, List[List[int]]]]: - """Runs git diff to get the line ranges of all file changes.""" - command = shlex.split("git diff-index --unified=0") + [revision, filename] - output = run_shell_command(command) - changed_lines = [] - for chunk in re.finditer(CHUNK_PATTERN, output, re.MULTILINE): - start = int(chunk.group(1)) - count = int(chunk.group(2) or 1) - # If count == 0, a chunk was removed and can be ignored. - if count == 0: - continue - changed_lines.append([start, start + count]) +def find_changed_lines_from_diff(f: IO[str]) -> Dict[str, List[Tuple[int, int]]]: + content = f.read() + files = collections.defaultdict(list) + + matches = re.findall(CHUNK_HEADER_RE, content, re.MULTILINE) + for file, start, end in matches: + start_line, _ = start.split(",") + end_line, _ = end.split(",") + print(file, start_line, end_line) + + files[file].append((start_line, end_line)) + + return dict(files) - return {"name": filename, "lines": changed_lines} ninja_template = """ rule do_cmd @@ -180,7 +171,7 @@ def run_shell_commands_in_parallel(commands: Iterable[List[str]]) -> str: return run_shell_command(['ninja', '-f', f.name]) -def run_clang_tidy(options: Any, line_filters: Any, files: Iterable[str]) -> str: +def run_clang_tidy(options: Any, line_filters: Dict[str, List[Tuple[int, int]]], files: Iterable[str]) -> str: """Executes the actual clang-tidy command in the shell.""" command = [options.clang_tidy_exe, "-p", options.compile_commands_dir] if not options.config_file and os.path.exists(".clang-tidy"): @@ -283,7 +274,7 @@ def 
parse_options() -> Any: help="Path to the folder containing compile_commands.json", ) parser.add_argument( - "-d", "--diff", help="Git revision to diff against to get changes" + "--diff-file", help="File containing diff to use for determining files to lint and line filters" ) parser.add_argument( "-p", @@ -333,8 +324,10 @@ def main() -> None: # Normalize the paths first. paths = [path.rstrip("/") for path in options.paths] - if options.diff: - files = get_changed_files(options.diff, paths) + if options.diff_file: + with open(options.diff_file, "r") as f: + line_filters = find_changed_lines_from_diff(f) + files = list(line_filters.keys()) else: files = get_all_files(paths) file_patterns = get_file_patterns(options.glob, options.regex) @@ -345,10 +338,6 @@ def main() -> None: print("No files detected.") sys.exit() - line_filters = [] - if options.diff: - line_filters = [get_changed_lines(options.diff, f) for f in files] - clang_tidy_output = run_clang_tidy(options, line_filters, files) if options.suppress_diagnostics: warnings = extract_warnings(clang_tidy_output, base_dir=options.compile_commands_dir) From 2cc115eb039044178cce928ac682b214ef538d14 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Tue, 15 Jun 2021 15:45:25 -0700 Subject: [PATCH 2/4] Update on "Use GitHub's diff directly in clang-tidy" This changes clang-tidy in lint.yml to pull the raw diff from GitHub and parse that rather than use the PRs base revision. The base revision can cause the spurious inclusion of files not changed in the PR as in https://github.com/pytorch/pytorch/pull/59967/checks?check_run_id=2832565901. We could be smarter about how we query git, but this approach ends up being simpler since we just need to search for the diff headers in the .diff file. 
See https://github.com/pytorch/pytorch/pull/60049/checks?check_run_id=2834140350 for an example CI run with this change applied [ghstack-poisoned] --- tools/clang_tidy.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index a306e9047c745..3a8296e62c52c 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -326,8 +326,11 @@ def main() -> None: paths = [path.rstrip("/") for path in options.paths] if options.diff_file: with open(options.diff_file, "r") as f: - line_filters = find_changed_lines_from_diff(f) - files = list(line_filters.keys()) + changed_files = find_changed_lines_from_diff(f) + line_filters = [ + {"name": name, "lines": lines} for name, lines, in changed_files.items() + ] + files = list(changed_files.keys()) else: files = get_all_files(paths) file_patterns = get_file_patterns(options.glob, options.regex) From 51acff253beacbf54e9cb47715cd8ab4df1b54b7 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Tue, 15 Jun 2021 15:54:08 -0700 Subject: [PATCH 3/4] Update on "Use GitHub's diff directly in clang-tidy" This changes clang-tidy in lint.yml to pull the raw diff from GitHub and parse that rather than use the PR's base revision. The base revision can cause the spurious inclusion of files not changed in the PR as in https://github.com/pytorch/pytorch/pull/59967/checks?check_run_id=2832565901. We could be smarter about how we query git, but this approach ends up being simpler since we just need to search for the diff headers in the .diff file.
See https://github.com/pytorch/pytorch/pull/60049/checks?check_run_id=2834140350 for an example CI run with this change applied [ghstack-poisoned] --- tools/clang_tidy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 3a8296e62c52c..294c427386080 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -171,7 +171,7 @@ def run_shell_commands_in_parallel(commands: Iterable[List[str]]) -> str: return run_shell_command(['ninja', '-f', f.name]) -def run_clang_tidy(options: Any, line_filters: Dict[str, List[Tuple[int, int]]], files: Iterable[str]) -> str: +def run_clang_tidy(options: Any, line_filters: List[Dict[str, Any]], files: Iterable[str]) -> str: """Executes the actual clang-tidy command in the shell.""" command = [options.clang_tidy_exe, "-p", options.compile_commands_dir] if not options.config_file and os.path.exists(".clang-tidy"): @@ -332,6 +332,7 @@ def main() -> None: ] files = list(changed_files.keys()) else: + line_filters = [] files = get_all_files(paths) file_patterns = get_file_patterns(options.glob, options.regex) files = list(filter_files(files, file_patterns)) From d201f9d898a3723f4d386c67015a1bce4bdc91a5 Mon Sep 17 00:00:00 2001 From: David Riazati Date: Tue, 15 Jun 2021 16:20:22 -0700 Subject: [PATCH 4/4] Update on "Use GitHub's diff directly in clang-tidy" This changes clang-tidy in lint.yml to pull the raw diff from GitHub and parse that rather than use the PR's base revision. The way we diff with the base revision can cause the spurious inclusion of files not changed in the PR as in https://github.com/pytorch/pytorch/pull/59967/checks?check_run_id=2832565901. We could be smarter about how we query git, but this approach ends up being simpler since we just need to search for the diff headers in the .diff file.
See https://github.com/pytorch/pytorch/pull/60049/checks?check_run_id=2834254915 for an example CI run with this change applied (the failure is expected; the relevant part is that the line filter still works fine) Differential Revision: [D29148886](https://our.internmc.facebook.com/intern/diff/D29148886) [ghstack-poisoned] --- tools/clang_tidy.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 294c427386080..7574c4f3b538e 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -31,7 +31,7 @@ except ImportError: from pipes import quote -from typing import Any, Dict, Iterable, List, Set, IO, Tuple +from typing import Any, Dict, Iterable, List, Set, Tuple Patterns = collections.namedtuple("Patterns", "positive, negative") @@ -41,9 +41,13 @@ # (c/cc/cpp) file. DEFAULT_FILE_PATTERN = re.compile(r".*\.c(c|pp)?") -# @@ -start,count +start,count @@ -CHUNK_PATTERN = r"^@@\s+-\d+(?:,\d+)?\s+\+(\d+)(?:,(\d+))?\s+@@" +# Search for: +# diff --git ... +# index ... +# --- ... +# +++ ...
CHUNK_HEADER_RE = r"diff --git .*?\nindex.*?\n---.*?\n\+\+\+ b/(.*?)\n@@ -(\d+,\d+) \+(\d+,\d+) @@" + CLANG_WARNING_PATTERN = re.compile(r"([^:]+):(\d+):\d+:\s+warning:.*\[([^\]]+)\]") @@ -131,11 +135,10 @@ def get_all_files(paths: List[str]) -> List[str]: return output.split("\n") -def find_changed_lines_from_diff(f: IO[str]) -> Dict[str, List[Tuple[int, int]]]: - content = f.read() +def find_changed_lines(diff: str) -> Dict[str, List[Tuple[int, int]]]: files = collections.defaultdict(list) - matches = re.findall(CHUNK_HEADER_RE, content, re.MULTILINE) + matches = re.findall(CHUNK_HEADER_RE, diff, re.MULTILINE) for file, start, end in matches: start_line, _ = start.split(",") end_line, _ = end.split(",") @@ -326,7 +329,7 @@ def main() -> None: paths = [path.rstrip("/") for path in options.paths] if options.diff_file: with open(options.diff_file, "r") as f: - changed_files = find_changed_lines_from_diff(f) + changed_files = find_changed_lines(f.read()) line_filters = [ {"name": name, "lines": lines} for name, lines, in changed_files.items() ]