Allow nativeparse to parse source code directly (#21260)

bzoracler · pre-commit-ci[bot] · ilevkivskyi · web-flow · commit 6f0e77b85910 · 2026-05-21T01:31:25.000+01:00
This is the mypy counterpart of mypyc/ast_serialize#54.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ivan Levkivskyi <levkivskyi@gmail.com>
diff --git a/misc/dump-ast.py b/misc/dump-ast.py
@@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No
     options.python_version = python_version
     with open(fname, "rb") as f:
         s = f.read()
-        tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True)
+        tree = parse(s, fname, None, errors=Errors(options), options=options)
         if not quiet:
             print(tree)
 
diff --git a/mypy/build.py b/mypy/build.py
@@ -1024,85 +1024,76 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None:
                 self.post_parse_all(states)
             return
 
-        sequential_states = []
         parallel_states = []
         for state in states:
+            if not self.fscache.exists(state.xpath, real_only=True):
+                state.source = state.get_source()
             if state.tree is not None:
                 # The file was already parsed.
-                continue
-            if not self.fscache.exists(state.xpath, real_only=True):
-                # New parser only supports parsing on-disk files.
-                sequential_states.append(state)
+                state.needs_parse = False
                 continue
             parallel_states.append(state)
+
         if len(parallel_states) > 1:
-            self.parse_parallel(sequential_states, parallel_states)
-        else:
-            # Avoid using executor when there is no parallelism.
-            for state in states:
-                state.parse_file()
-        if post_parse:
-            self.post_parse_all(states)
+            # This duplicates a bit of logic from State.parse_file(). This is done as an
+            # optimization to parallelize only those parts of the code that can be
+            # parallelized efficiently.
 
-    def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None:
-        """Perform parallel parsing of states.
+            parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
+                parallel_states
+            )
 
-        Note: this duplicates a bit of logic from State.parse_file(). This is done
-        as an optimization to parallelize only those parts of the code that can be
-        parallelized efficiently.
-        """
-        parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
-            sequential_states, parallel_states
-        )
+            for state in parallel_parsed_states:
+                # New parser only returns serialized ASTs
+                with state.wrap_context():
+                    assert state.tree is not None
+                    raw_data = state.tree.raw_data
+                    if raw_data is not None:
+                        # Apply inline mypy config before deserialization, since
+                        # some options (e.g. implicit_optional) affect how the
+                        # AST is built during deserialization.
+                        state.source_hash = raw_data.source_hash
+                        state.apply_inline_configuration(raw_data.mypy_comments)
+                        state.tree = load_from_raw(
+                            state.xpath,
+                            state.id,
+                            raw_data,
+                            self.errors,
+                            state.options,
+                            imports_only=bool(self.workers),
+                        )
+                    if self.errors.is_blockers():
+                        self.log("Bailing due to parse errors")
+                        self.errors.raise_error()
 
-        for state in parallel_parsed_states:
-            # New parser returns serialized ASTs. Deserialize full trees only if not using
-            # parallel workers.
-            with state.wrap_context():
+            for state in parallel_states:
                 assert state.tree is not None
-                raw_data = state.tree.raw_data
-                if raw_data is not None:
-                    # Apply inline mypy config before deserialization, since
-                    # some options (e.g. implicit_optional) affect deserialization
-                    state.source_hash = raw_data.source_hash
-                    state.apply_inline_configuration(raw_data.mypy_comments)
-                    state.tree = load_from_raw(
-                        state.xpath,
-                        state.id,
-                        raw_data,
-                        self.errors,
-                        state.options,
-                        imports_only=bool(self.workers),
-                    )
-                if self.errors.is_blockers():
-                    self.log("Bailing due to parse errors")
-                    self.errors.raise_error()
-
-        for state in parallel_states:
-            assert state.tree is not None
-            if state in parallel_parsed_states_set:
+                if state in parallel_parsed_states_set:
+                    if state.tree.raw_data is not None:
+                        # source_hash was already extracted above, but raw_data
+                        # may have been preserved for workers (imports_only=True).
+                        pass
+                    elif state.source_hash is None:
+                        # At least namespace packages may not have source.
+                        state.get_source()
+                    state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
+                    state.semantic_analysis_pass1()
+                    self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
+                self.modules[state.id] = state.tree
                 if state.tree.raw_data is not None:
-                    # source_hash was already extracted above, but raw_data
-                    # may have been preserved for workers (imports_only=True).
-                    pass
-                elif state.source_hash is None:
-                    # At least namespace packages may not have source.
-                    state.get_source()
-                state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
-                state.semantic_analysis_pass1()
-                self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
-            self.modules[state.id] = state.tree
-            if state.tree.raw_data is not None:
-                state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
-            state.check_blockers()
-            state.setup_errors()
-
-    def parse_files_threaded_raw(
-        self, sequential_states: list[State], parallel_states: list[State]
-    ) -> tuple[list[State], set[State]]:
-        """Parse files using a thread pool.
-
-        Also parse sequential states while waiting for the parallel results.
+                    state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
+                state.check_blockers()
+                state.setup_errors()
+        elif len(parallel_states) == 1:
+            # Avoid using executor when there is no parallelism.
+            parallel_states[0].parse_file()
+
+        if post_parse:
+            self.post_parse_all(states)
+
+    def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]:
+        """Parse files in parallel using a thread pool.
+
         Trees from the new parser are left in raw (serialized) form.
 
         Return (list, set) of states that were actually parsed (not cached).
@@ -1118,25 +1109,21 @@ def parse_files_threaded_raw(
         # parse_file_inner() results in no visible improvement with more than 8 threads.
         # TODO: reuse thread pool and/or batch small files in single submit() call.
         with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
-            for state in parallel_states:
+            for state in states:
                 state.needs_parse = False
                 if state.id not in self.ast_cache:
                     self.log(f"Parsing {state.xpath} ({state.id})")
                     ignore_errors = state.ignore_all or state.options.ignore_errors
                     if ignore_errors:
                         self.errors.ignored_files.add(state.xpath)
-                    futures.append(executor.submit(state.parse_file_inner, ""))
+                    futures.append(executor.submit(state.parse_file_inner, state.source))
                     parallel_parsed_states.append(state)
                     parallel_parsed_states_set.add(state)
                 else:
                     self.log(f"Using cached AST for {state.xpath} ({state.id})")
                     state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
                     state.source_hash = source_hash
 
-            # Parse sequential before waiting on parallel.
-            for state in sequential_states:
-                state.parse_file()
-
             for fut in wait(futures).done:
                 fut.result()
 
@@ -1279,21 +1266,20 @@ def parse_file(
         self,
         id: str,
         path: str,
-        source: str,
+        source: str | None,
         options: Options,
         raw_data: FileRawData | None = None,
     ) -> MypyFile:
         """Parse the source of a file with the given name.
 
         Raise CompileError if there is a parse error.
         """
-        file_exists = self.fscache.exists(path, real_only=True)
         t0 = time.time()
         if raw_data:
             # If possible, deserialize from known binary data instead of parsing from scratch.
             tree = load_from_raw(path, id, raw_data, self.errors, options)
         else:
-            tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists)
+            tree = parse(source, path, id, self.errors, options=options)
         tree._fullname = id
         if self.stats_enabled:
             with self.stats_lock:
@@ -3179,7 +3165,7 @@ def get_source(self) -> str:
                     else:
                         err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}"
                     raise CompileError([err], module_with_blocker=self.id) from decodeerr
-            elif self.path and self.manager.fscache.isdir(self.path):
+            elif self.path and manager.fscache.isdir(self.path):
                 source = ""
                 self.source_hash = ""
             else:
@@ -3192,7 +3178,7 @@ def get_source(self) -> str:
         self.time_spent_us += time_spent_us(t0)
         return source
 
-    def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None:
+    def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None:
         t0 = time_ref()
         self.tree = self.manager.parse_file(
             self.id, self.xpath, source, options=self.options, raw_data=raw_data
@@ -3319,9 +3305,7 @@ def semantic_analysis_pass1(self) -> None:
         #
         # TODO: This should not be considered as a semantic analysis
         #     pass -- it's an independent pass.
-        if not options.native_parser or not self.manager.fscache.exists(
-            self.xpath, real_only=True
-        ):
+        if not options.native_parser:
             analyzer = SemanticAnalyzerPreAnalysis()
             with self.wrap_context():
                 analyzer.visit_file(self.tree, self.xpath, self.id, options)
diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py
@@ -587,7 +587,6 @@ def apply_field_accessors(
             module=None,
             options=self.chk.options,
             errors=temp_errors,
-            file_exists=False,
             eager=True,
         )
         if temp_errors.is_errors():
diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py
@@ -182,7 +182,10 @@ def add_error(
 
 
 def native_parse(
-    filename: str, options: Options, skip_function_bodies: bool = False
+    filename: str,
+    options: Options,
+    source: str | bytes | None = None,
+    skip_function_bodies: bool = False,
 ) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
     """Parse a Python file using the native Rust-based parser.
 
@@ -211,7 +214,7 @@ def native_parse(
         uses_template_strings,
         source_hash,
         mypy_comments,
-    ) = parse_to_binary_ast(filename, options, skip_function_bodies)
+    ) = parse_to_binary_ast(filename, options, source, skip_function_bodies)
     node = MypyFile([], [])
     node.path = filename
     node.raw_data = FileRawData(
@@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:
 
 
 def parse_to_binary_ast(
-    filename: str, options: Options, skip_function_bodies: bool = False
+    filename: str,
+    options: Options,
+    source: str | bytes | None = None,
+    skip_function_bodies: bool = False,
 ) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
     # This is a horrible hack to work around a mypyc bug where imported
     # module may be not ready in a thread sometimes.
@@ -259,6 +265,7 @@ def parse_to_binary_ast(
             raise ImportError("Cannot import ast_serialize")
     ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse(
         filename,
+        source,
         skip_function_bodies=skip_function_bodies,
         python_version=options.python_version,
         platform=options.platform,
diff --git a/mypy/parse.py b/mypy/parse.py
@@ -12,12 +12,11 @@
 
 
 def parse(
-    source: str | bytes,
+    source: str | bytes | None,
     fnam: str,
     module: str | None,
     errors: Errors,
     options: Options,
-    file_exists: bool,
     eager: bool = False,
 ) -> MypyFile:
     """Parse a source file, without doing any semantic analysis.
@@ -27,28 +26,29 @@ def parse(
 
     New parser returns empty tree with serialized data. To get the full tree and
     the parse errors, use eager=True.
+
+    `source` must not be `None` if the old parser is used. The new parser will read and
+    parse contents from path `fnam` if `source` is `None`.
     """
     if options.native_parser:
-        # Native parser only works with actual files on disk
-        # Fall back to fastparse for in-memory source or non-existent files
-        if file_exists:
-            import mypy.nativeparse
-
-            ignore_errors = options.ignore_errors or fnam in errors.ignored_files
-            # If errors are ignored, we can drop many function bodies to speed up type checking.
-            strip_function_bodies = ignore_errors and not options.preserve_asts
-            tree, _, _ = mypy.nativeparse.native_parse(
-                fnam, options, skip_function_bodies=strip_function_bodies
-            )
-            # Set is_stub based on file extension
-            tree.is_stub = fnam.endswith(".pyi")
-            # Note: tree.imports is populated directly by load_from_raw() with deserialized
-            # import metadata, so we don't need to collect imports via AST traversal
-            if eager and tree.raw_data is not None:
-                tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
-            return tree
-        # Fall through to fastparse for non-existent files
-
+        import mypy.nativeparse
+
+        ignore_errors = options.ignore_errors or fnam in errors.ignored_files
+        # If errors are ignored, we can drop many function bodies to speed up type checking.
+        strip_function_bodies = ignore_errors and not options.preserve_asts
+        tree, _, _ = mypy.nativeparse.native_parse(
+            fnam, options, source, skip_function_bodies=strip_function_bodies
+        )
+        # Set is_stub based on file extension
+        tree.is_stub = fnam.endswith(".pyi")
+        # Note: tree.imports is populated directly by load_from_raw() with deserialized
+        # import metadata, so we don't need to collect imports via AST traversal
+        if eager and tree.raw_data is not None:
+            tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
+        return tree
+
+    if source is None:
+        raise ValueError("Source cannot be `None` when using the old parser")
     if options.transform_source is not None:
         source = options.transform_source(source)
     import mypy.fastparse
diff --git a/mypy/stubgen.py b/mypy/stubgen.py
@@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None:
     source = mypy.util.decode_python_encoding(data)
     errors = Errors(mypy_options)
     mod.ast = mypy.parse.parse(
-        source,
-        fnam=mod.path,
-        module=mod.module,
-        errors=errors,
-        options=mypy_options,
-        file_exists=True,
-        eager=True,
+        source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True
     )
     mod.ast._fullname = mod.module
     if errors.is_blockers():
diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py
diff --git a/mypy/test/testparse.py b/mypy/test/testparse.py

Original file line number	Diff line number	Diff line change
`@@ -587,7 +587,6 @@ def apply_field_accessors(`
`587`	`587`	`module=None,`
`588`	`588`	`options=self.chk.options,`
`589`	`589`	`errors=temp_errors,`
`590`		`- file_exists=False,`
`591`	`590`	`eager=True,`
`592`	`591`	`)`
`593`	`592`	`if temp_errors.is_errors():`