diff --git a/README.rst b/README.rst index 47c8032900..598a57f76a 100644 --- a/README.rst +++ b/README.rst @@ -172,6 +172,23 @@ Words should be separated by a comma. def wrod(wrods): pass +Ignoring misspellings marked with "[sic]" +----------------------------------------- + +The ``--ignore-sic`` option tells codespell to skip a misspelling that is +followed by the editorial ``[sic]`` marker (case-insensitive). Only the single +occurrence preceding the marker is ignored, so other misspellings on the same +line are still reported. A closing quote may sit between the word and the +marker, which is the common case when documenting a corrected typo (for example +in a changelog): + +.. code-block:: text + + correct the "wrod" [sic] typo in a changelog entry + +Unlike ``codespell:ignore``, the marker is part of the prose itself and does not +require naming the word in a tooling comment. + Using a config file ------------------- diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 3486b36132..7d37078804 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -63,6 +63,9 @@ ignore_next_line_regex = re.compile( rf"[^\w\s]\s*{codespell_ignore_next_line_tag}\b(\s+(?P<words>[\w,]*))?" ) +# Editorial "[sic]" marker following a word, allowing an intervening closing +# quote or parenthesis so a quoted typo like `"wrod" [sic]` is matched. +sic_regex = re.compile(r"[\"'’”)\s]*\[sic\]", re.IGNORECASE) # noqa: RUF001 USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] """ @@ -658,6 +661,13 @@ def convert_arg_line_to_args(self, arg_line: str) -> list[str]: metavar="LINES", help="print LINES of surrounding context", ) + parser.add_argument( + "--ignore-sic", + action="store_true", + default=False, + help='ignore a misspelling immediately followed by the editorial "[sic]" ' + "marker (optionally preceded by a closing quote).", + ) parser.add_argument( "--stdin-single-line", action="store_true", @@ -1049,6 +1059,11 @@ def parse_lines( ): continue + # A "[sic]" marker right after the word flags it as an + # intentional/quoted spelling, so leave it alone. + if options.ignore_sic and sic_regex.match(line, match.end()): + continue + context_shown = False fix = misspellings[lword].fix fixword = fix_case(word, misspellings[lword].data) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index fef55d9c03..fe11084955 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -493,6 +493,46 @@ def test_inline_ignores( assert cs.main(d) == expected_error_count +@pytest.mark.parametrize( + ("content", "expected_error_count"), + [ + # marker right after the word (optional whitespace) excuses it + ("they wrod [sic] it\n", 0), + ("they wrod[sic] it\n", 0), + ("they wrod  [sic] it\n", 0), + # case-insensitive marker + ("they wrod [SIC] it\n", 0), + # quoted typo followed by the marker (changelog use case) + ('correct "wrod" [sic] typo\n', 0), + ('correct "wrod"[sic] typo\n', 0), + # only the immediately preceding occurrence is excused + ("wrod wrod [sic]\n", 1), + # a marker elsewhere on the line does not excuse the word + ("wrod it [sic] anyway abilty\n", 2), + # an intervening word breaks the association + ('wrod" abilty [sic]\n', 1), + # without a marker the misspelling is still reported + ("they wrod it\n", 1), + # not a real marker + ("they wrod (sic) it\n", 1), + ("they wrod [sick] it\n", 1), + ], +) +def test_ignore_sic( + tmpdir: pytest.TempPathFactory, + capsys: 
pytest.CaptureFixture[str], + content: str, + expected_error_count: int, +) -> None: + d = str(tmpdir) + with open(op.join(d, "bad.txt"), "w", encoding="utf-8") as f: + f.write(content) + # off by default + assert cs.main(d) == content.count("wrod") + content.count("abilty") + # opt-in + assert cs.main("--ignore-sic", d) == expected_error_count + + @pytest.mark.parametrize( ("content", "expected_error_count"), [