Skip to content

Commit 6f0e77b

Browse files
bzoracler, pre-commit-ci[bot], ilevkivskyi
authored
Allow nativeparse to parse source code directly (#21260)
This is the mypy counterpart of mypyc/ast_serialize#54.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ivan Levkivskyi <levkivskyi@gmail.com>
1 parent ab8e4bf commit 6f0e77b

8 files changed

Lines changed: 123 additions & 123 deletions

File tree

misc/dump-ast.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No
1919
options.python_version = python_version
2020
with open(fname, "rb") as f:
2121
s = f.read()
22-
tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True)
22+
tree = parse(s, fname, None, errors=Errors(options), options=options)
2323
if not quiet:
2424
print(tree)
2525

mypy/build.py

Lines changed: 65 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,85 +1024,76 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None:
10241024
self.post_parse_all(states)
10251025
return
10261026

1027-
sequential_states = []
10281027
parallel_states = []
10291028
for state in states:
1029+
if not self.fscache.exists(state.xpath, real_only=True):
1030+
state.source = state.get_source()
10301031
if state.tree is not None:
10311032
# The file was already parsed.
1032-
continue
1033-
if not self.fscache.exists(state.xpath, real_only=True):
1034-
# New parser only supports parsing on-disk files.
1035-
sequential_states.append(state)
1033+
state.needs_parse = False
10361034
continue
10371035
parallel_states.append(state)
1036+
10381037
if len(parallel_states) > 1:
1039-
self.parse_parallel(sequential_states, parallel_states)
1040-
else:
1041-
# Avoid using executor when there is no parallelism.
1042-
for state in states:
1043-
state.parse_file()
1044-
if post_parse:
1045-
self.post_parse_all(states)
1038+
# This duplicates a bit of logic from State.parse_file(). This is done as an
1039+
# optimization to parallelize only those parts of the code that can be
1040+
# parallelized efficiently.
10461041

1047-
def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None:
1048-
"""Perform parallel parsing of states.
1042+
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
1043+
parallel_states
1044+
)
10491045

1050-
Note: this duplicates a bit of logic from State.parse_file(). This is done
1051-
as an optimization to parallelize only those parts of the code that can be
1052-
parallelized efficiently.
1053-
"""
1054-
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
1055-
sequential_states, parallel_states
1056-
)
1046+
for state in parallel_parsed_states:
1047+
# New parser only returns serialized ASTs
1048+
with state.wrap_context():
1049+
assert state.tree is not None
1050+
raw_data = state.tree.raw_data
1051+
if raw_data is not None:
1052+
# Apply inline mypy config before deserialization, since
1053+
# some options (e.g. implicit_optional) affect how the
1054+
# AST is built during deserialization.
1055+
state.source_hash = raw_data.source_hash
1056+
state.apply_inline_configuration(raw_data.mypy_comments)
1057+
state.tree = load_from_raw(
1058+
state.xpath,
1059+
state.id,
1060+
raw_data,
1061+
self.errors,
1062+
state.options,
1063+
imports_only=bool(self.workers),
1064+
)
1065+
if self.errors.is_blockers():
1066+
self.log("Bailing due to parse errors")
1067+
self.errors.raise_error()
10571068

1058-
for state in parallel_parsed_states:
1059-
# New parser returns serialized ASTs. Deserialize full trees only if not using
1060-
# parallel workers.
1061-
with state.wrap_context():
1069+
for state in parallel_states:
10621070
assert state.tree is not None
1063-
raw_data = state.tree.raw_data
1064-
if raw_data is not None:
1065-
# Apply inline mypy config before deserialization, since
1066-
# some options (e.g. implicit_optional) affect deserialization
1067-
state.source_hash = raw_data.source_hash
1068-
state.apply_inline_configuration(raw_data.mypy_comments)
1069-
state.tree = load_from_raw(
1070-
state.xpath,
1071-
state.id,
1072-
raw_data,
1073-
self.errors,
1074-
state.options,
1075-
imports_only=bool(self.workers),
1076-
)
1077-
if self.errors.is_blockers():
1078-
self.log("Bailing due to parse errors")
1079-
self.errors.raise_error()
1080-
1081-
for state in parallel_states:
1082-
assert state.tree is not None
1083-
if state in parallel_parsed_states_set:
1071+
if state in parallel_parsed_states_set:
1072+
if state.tree.raw_data is not None:
1073+
# source_hash was already extracted above, but raw_data
1074+
# may have been preserved for workers (imports_only=True).
1075+
pass
1076+
elif state.source_hash is None:
1077+
# At least namespace packages may not have source.
1078+
state.get_source()
1079+
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1080+
state.semantic_analysis_pass1()
1081+
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
1082+
self.modules[state.id] = state.tree
10841083
if state.tree.raw_data is not None:
1085-
# source_hash was already extracted above, but raw_data
1086-
# may have been preserved for workers (imports_only=True).
1087-
pass
1088-
elif state.source_hash is None:
1089-
# At least namespace packages may not have source.
1090-
state.get_source()
1091-
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1092-
state.semantic_analysis_pass1()
1093-
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
1094-
self.modules[state.id] = state.tree
1095-
if state.tree.raw_data is not None:
1096-
state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
1097-
state.check_blockers()
1098-
state.setup_errors()
1099-
1100-
def parse_files_threaded_raw(
1101-
self, sequential_states: list[State], parallel_states: list[State]
1102-
) -> tuple[list[State], set[State]]:
1103-
"""Parse files using a thread pool.
1104-
1105-
Also parse sequential states while waiting for the parallel results.
1084+
state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT
1085+
state.check_blockers()
1086+
state.setup_errors()
1087+
elif len(parallel_states) == 1:
1088+
# Avoid using executor when there is no parallelism.
1089+
parallel_states[0].parse_file()
1090+
1091+
if post_parse:
1092+
self.post_parse_all(states)
1093+
1094+
def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]:
1095+
"""Parse files in parallel using a thread pool.
1096+
11061097
Trees from the new parser are left in raw (serialized) form.
11071098
11081099
Return (list, set) of states that were actually parsed (not cached).
@@ -1118,25 +1109,21 @@ def parse_files_threaded_raw(
11181109
# parse_file_inner() results in no visible improvement with more than 8 threads.
11191110
# TODO: reuse thread pool and/or batch small files in single submit() call.
11201111
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
1121-
for state in parallel_states:
1112+
for state in states:
11221113
state.needs_parse = False
11231114
if state.id not in self.ast_cache:
11241115
self.log(f"Parsing {state.xpath} ({state.id})")
11251116
ignore_errors = state.ignore_all or state.options.ignore_errors
11261117
if ignore_errors:
11271118
self.errors.ignored_files.add(state.xpath)
1128-
futures.append(executor.submit(state.parse_file_inner, ""))
1119+
futures.append(executor.submit(state.parse_file_inner, state.source))
11291120
parallel_parsed_states.append(state)
11301121
parallel_parsed_states_set.add(state)
11311122
else:
11321123
self.log(f"Using cached AST for {state.xpath} ({state.id})")
11331124
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
11341125
state.source_hash = source_hash
11351126

1136-
# Parse sequential before waiting on parallel.
1137-
for state in sequential_states:
1138-
state.parse_file()
1139-
11401127
for fut in wait(futures).done:
11411128
fut.result()
11421129

@@ -1279,21 +1266,20 @@ def parse_file(
12791266
self,
12801267
id: str,
12811268
path: str,
1282-
source: str,
1269+
source: str | None,
12831270
options: Options,
12841271
raw_data: FileRawData | None = None,
12851272
) -> MypyFile:
12861273
"""Parse the source of a file with the given name.
12871274
12881275
Raise CompileError if there is a parse error.
12891276
"""
1290-
file_exists = self.fscache.exists(path, real_only=True)
12911277
t0 = time.time()
12921278
if raw_data:
12931279
# If possible, deserialize from known binary data instead of parsing from scratch.
12941280
tree = load_from_raw(path, id, raw_data, self.errors, options)
12951281
else:
1296-
tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists)
1282+
tree = parse(source, path, id, self.errors, options=options)
12971283
tree._fullname = id
12981284
if self.stats_enabled:
12991285
with self.stats_lock:
@@ -3179,7 +3165,7 @@ def get_source(self) -> str:
31793165
else:
31803166
err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}"
31813167
raise CompileError([err], module_with_blocker=self.id) from decodeerr
3182-
elif self.path and self.manager.fscache.isdir(self.path):
3168+
elif self.path and manager.fscache.isdir(self.path):
31833169
source = ""
31843170
self.source_hash = ""
31853171
else:
@@ -3192,7 +3178,7 @@ def get_source(self) -> str:
31923178
self.time_spent_us += time_spent_us(t0)
31933179
return source
31943180

3195-
def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None:
3181+
def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None:
31963182
t0 = time_ref()
31973183
self.tree = self.manager.parse_file(
31983184
self.id, self.xpath, source, options=self.options, raw_data=raw_data
@@ -3319,9 +3305,7 @@ def semantic_analysis_pass1(self) -> None:
33193305
#
33203306
# TODO: This should not be considered as a semantic analysis
33213307
# pass -- it's an independent pass.
3322-
if not options.native_parser or not self.manager.fscache.exists(
3323-
self.xpath, real_only=True
3324-
):
3308+
if not options.native_parser:
33253309
analyzer = SemanticAnalyzerPreAnalysis()
33263310
with self.wrap_context():
33273311
analyzer.visit_file(self.tree, self.xpath, self.id, options)

mypy/checkstrformat.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,6 @@ def apply_field_accessors(
587587
module=None,
588588
options=self.chk.options,
589589
errors=temp_errors,
590-
file_exists=False,
591590
eager=True,
592591
)
593592
if temp_errors.is_errors():

mypy/nativeparse.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,10 @@ def add_error(
182182

183183

184184
def native_parse(
185-
filename: str, options: Options, skip_function_bodies: bool = False
185+
filename: str,
186+
options: Options,
187+
source: str | bytes | None = None,
188+
skip_function_bodies: bool = False,
186189
) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
187190
"""Parse a Python file using the native Rust-based parser.
188191
@@ -211,7 +214,7 @@ def native_parse(
211214
uses_template_strings,
212215
source_hash,
213216
mypy_comments,
214-
) = parse_to_binary_ast(filename, options, skip_function_bodies)
217+
) = parse_to_binary_ast(filename, options, source, skip_function_bodies)
215218
node = MypyFile([], [])
216219
node.path = filename
217220
node.raw_data = FileRawData(
@@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:
248251

249252

250253
def parse_to_binary_ast(
251-
filename: str, options: Options, skip_function_bodies: bool = False
254+
filename: str,
255+
options: Options,
256+
source: str | bytes | None = None,
257+
skip_function_bodies: bool = False,
252258
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
253259
# This is a horrible hack to work around a mypyc bug where imported
254260
# module may be not ready in a thread sometimes.
@@ -259,6 +265,7 @@ def parse_to_binary_ast(
259265
raise ImportError("Cannot import ast_serialize")
260266
ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse(
261267
filename,
268+
source,
262269
skip_function_bodies=skip_function_bodies,
263270
python_version=options.python_version,
264271
platform=options.platform,

mypy/parse.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,11 @@
1212

1313

1414
def parse(
15-
source: str | bytes,
15+
source: str | bytes | None,
1616
fnam: str,
1717
module: str | None,
1818
errors: Errors,
1919
options: Options,
20-
file_exists: bool,
2120
eager: bool = False,
2221
) -> MypyFile:
2322
"""Parse a source file, without doing any semantic analysis.
@@ -27,28 +26,29 @@ def parse(
2726
2827
New parser returns empty tree with serialized data. To get the full tree and
2928
the parse errors, use eager=True.
29+
30+
`source` must not be `None` if the old parser is used. The new parser will read and
31+
parse contents from path `fnam` if `source` is `None`.
3032
"""
3133
if options.native_parser:
32-
# Native parser only works with actual files on disk
33-
# Fall back to fastparse for in-memory source or non-existent files
34-
if file_exists:
35-
import mypy.nativeparse
36-
37-
ignore_errors = options.ignore_errors or fnam in errors.ignored_files
38-
# If errors are ignored, we can drop many function bodies to speed up type checking.
39-
strip_function_bodies = ignore_errors and not options.preserve_asts
40-
tree, _, _ = mypy.nativeparse.native_parse(
41-
fnam, options, skip_function_bodies=strip_function_bodies
42-
)
43-
# Set is_stub based on file extension
44-
tree.is_stub = fnam.endswith(".pyi")
45-
# Note: tree.imports is populated directly by load_from_raw() with deserialized
46-
# import metadata, so we don't need to collect imports via AST traversal
47-
if eager and tree.raw_data is not None:
48-
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
49-
return tree
50-
# Fall through to fastparse for non-existent files
51-
34+
import mypy.nativeparse
35+
36+
ignore_errors = options.ignore_errors or fnam in errors.ignored_files
37+
# If errors are ignored, we can drop many function bodies to speed up type checking.
38+
strip_function_bodies = ignore_errors and not options.preserve_asts
39+
tree, _, _ = mypy.nativeparse.native_parse(
40+
fnam, options, source, skip_function_bodies=strip_function_bodies
41+
)
42+
# Set is_stub based on file extension
43+
tree.is_stub = fnam.endswith(".pyi")
44+
# Note: tree.imports is populated directly by load_from_raw() with deserialized
45+
# import metadata, so we don't need to collect imports via AST traversal
46+
if eager and tree.raw_data is not None:
47+
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
48+
return tree
49+
50+
if source is None:
51+
raise ValueError("Source cannot be `None` when using the old parser")
5252
if options.transform_source is not None:
5353
source = options.transform_source(source)
5454
import mypy.fastparse

mypy/stubgen.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None:
17451745
source = mypy.util.decode_python_encoding(data)
17461746
errors = Errors(mypy_options)
17471747
mod.ast = mypy.parse.parse(
1748-
source,
1749-
fnam=mod.path,
1750-
module=mod.module,
1751-
errors=errors,
1752-
options=mypy_options,
1753-
file_exists=True,
1754-
eager=True,
1748+
source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True
17551749
)
17561750
mod.ast._fullname = mod.module
17571751
if errors.is_blockers():

0 commit comments

Comments
 (0)