diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 6e3662a8b7..cc14fcc22c 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -23,7 +23,19 @@
 import re
 import sys
 import textwrap
-from typing import Any, Dict, List, Match, Optional, Pattern, Sequence, Set, Tuple
+from multiprocessing import Pool
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    List,
+    Match,
+    Optional,
+    Pattern,
+    Sequence,
+    Set,
+    Tuple,
+)
 
 # autogenerated by setuptools_scm
 from ._version import __version__ as VERSION  # type: ignore  # noqa: N812
@@ -455,6 +467,20 @@ def parse_options(
         "should match the to-be-excluded lines exactly",
     )
 
+    parser.add_argument(
+        "-J",
+        "--jobs",
+        action="store",
+        type=int,
+        default=0,
+        help="set number of jobs to parallelize processing - one "
+        "subprocess per file:\n"
+        "- 0: no parallelization (default)\n"
+        "- positive integer: number of sub-processes to use\n"
+        "- -1: use all available CPUs\n"
+        "Interactive mode is not compatible with parallel processing",
+    )
+
     parser.add_argument(
         "-i",
         "--interactive",
@@ -1021,12 +1047,58 @@ def parse_file(
     return bad_count
 
 
+class _FileParser:
+    """Top-level callable binding all parse_file() arguments but the filename."""
+
+    def __init__(
+        self,
+        colors: TermColors,
+        summary: Optional[Summary],
+        misspellings: Dict[str, Misspelling],
+        exclude_lines: Set[str],
+        file_opener: FileOpener,
+        word_regex: Pattern[str],
+        ignore_word_regex: Optional[Pattern[str]],
+        uri_regex: Pattern[str],
+        uri_ignore_words: Set[str],
+        context: Optional[Tuple[int, int]],
+        options: argparse.Namespace,
+    ) -> None:
+        self.colors = colors
+        self.summary = summary
+        self.misspellings = misspellings
+        self.exclude_lines = exclude_lines
+        self.file_opener = file_opener
+        self.word_regex = word_regex
+        self.ignore_word_regex = ignore_word_regex
+        self.uri_regex = uri_regex
+        self.uri_ignore_words = uri_ignore_words
+        self.context = context
+        self.options = options
+
+    def __call__(self, filename: str) -> int:
+        return parse_file(
+            filename,
+            self.colors,
+            self.summary,
+            self.misspellings,
+            self.exclude_lines,
+            self.file_opener,
+            self.word_regex,
+            self.ignore_word_regex,
+            self.uri_regex,
+            self.uri_ignore_words,
+            self.context,
+            self.options,
+        )
+
+
 def _script_main() -> int:
     """Wrap to main() for setuptools."""
     return main(*sys.argv[1:])
 
 
-def main(*args: str) -> int:
+def main(*args: str) -> int:  # noqa: C901,PLR0915,PLR0911
     """Contains flow control"""
     try:
         options, parser, used_cfg_files = parse_options(args)
@@ -1138,6 +1210,25 @@ def main(*args: str) -> int:
     else:
         summary = None
 
+    if options.jobs and options.interactive:
+        print(
+            "ERROR: do not enable parallelization in interactive mode",
+            file=sys.stderr,
+        )
+        # deliberately no parser.print_help(): it would bury the ERROR above
+        return EX_USAGE
+
+    jobs = options.jobs
+    if jobs == -1:
+        jobs = os.cpu_count()
+    elif jobs < -1:
+        print(
+            f"ERROR: invalid number of jobs: {jobs}",
+            file=sys.stderr,
+        )
+        parser.print_help()
+        return EX_USAGE
+
     context = None
     if options.context is not None:
         if (options.before_context is not None) or (options.after_context is not None):
@@ -1176,66 +1267,68 @@ def main(*args: str) -> int:
         )
         return EX_USAGE
 
-    bad_count = 0
-    for filename in sorted(options.files):
-        # ignore hidden files
-        if is_hidden(filename, options.check_hidden):
-            continue
-
-        if os.path.isdir(filename):
-            for root, dirs, files in os.walk(filename):
-                if glob_match.match(root):  # skip (absolute) directories
-                    dirs.clear()
-                    continue
-                if is_hidden(root, options.check_hidden):  # dir itself hidden
-                    continue
-                for file_ in sorted(files):
-                    # ignore hidden files in directories
-                    if is_hidden(file_, options.check_hidden):
-                        continue
-                    if glob_match.match(file_):  # skip files
+    def _find_files() -> Generator[str, None, None]:
+        """Yield the path of every file that should be checked."""
+        for filename in sorted(options.files):
+            # ignore hidden files
+            if is_hidden(filename, options.check_hidden):
+                continue
+
+            if os.path.isdir(filename):
+                for root, dirs, files in os.walk(filename):
+                    if glob_match.match(root):  # skip (absolute) directories
+                        dirs.clear()
                         continue
-                    fname = os.path.join(root, file_)
-                    if glob_match.match(fname):  # skip paths
+                    if is_hidden(root, options.check_hidden):  # dir itself hidden
                         continue
-                    bad_count += parse_file(
-                        fname,
-                        colors,
-                        summary,
-                        misspellings,
-                        exclude_lines,
-                        file_opener,
-                        word_regex,
-                        ignore_word_regex,
-                        uri_regex,
-                        uri_ignore_words,
-                        context,
-                        options,
-                    )
+                    for file_ in sorted(files):
+                        # ignore hidden files in directories
+                        if is_hidden(file_, options.check_hidden):
+                            continue
+                        if glob_match.match(file_):  # skip files
+                            continue
+                        fname = os.path.join(root, file_)
+                        if glob_match.match(fname):  # skip paths
+                            continue
+                        yield fname
+
+                    # skip (relative) directories
+                    dirs[:] = [
+                        dir_
+                        for dir_ in dirs
+                        if not glob_match.match(dir_)
+                        and not is_hidden(dir_, options.check_hidden)
+                    ]
+
+            elif not glob_match.match(filename):  # skip files
+                yield filename
+
+    # bind the shared arguments so each job only needs to be given a filename
+    file_parser = _FileParser(
+        colors,
+        summary,
+        misspellings,
+        exclude_lines,
+        file_opener,
+        word_regex,
+        ignore_word_regex,
+        uri_regex,
+        uri_ignore_words,
+        context,
+        options,
+    )
 
-                # skip (relative) directories
-                dirs[:] = [
-                    dir_
-                    for dir_ in dirs
-                    if not glob_match.match(dir_)
-                    and not is_hidden(dir_, options.check_hidden)
-                ]
-
-        elif not glob_match.match(filename):  # skip files
-            bad_count += parse_file(
-                filename,
-                colors,
-                summary,
-                misspellings,
-                exclude_lines,
-                file_opener,
-                word_regex,
-                ignore_word_regex,
-                uri_regex,
-                uri_ignore_words,
-                context,
-                options,
-            )
+    if jobs:
+        # parse_file() runs in worker subprocess(es)
+        with Pool(jobs) as pool:
+            results = pool.map(file_parser, _find_files())
+        for result in results:
+            if isinstance(result, Exception):
+                raise result
+        bad_count = sum(results)
+    else:
+        # serial
+        bad_count = sum(map(file_parser, _find_files()))
 
     if summary:
         print("\n-------8<-------\nSUMMARY:")