Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e67c125
Added ruff to dev dependencies
Dec 10, 2025
06cbc7b
Added ruff settings to pyproject.toml as in docling
Dec 10, 2025
3035504
Cleanup of pyproject.toml
Dec 10, 2025
e4fac05
Copied settings for ruff pre-commit hooks from docling
Dec 10, 2025
38827c5
Excluded test/data/** from ruff formatting / linting
Dec 10, 2025
ffbe537
ruff format
Dec 10, 2025
81b3f8b
Added some ignore statements to pyproject.toml such that ruff check r…
Dec 10, 2025
260478f
ruff check --fix
Dec 10, 2025
8db072c
Ignored some more rules
Dec 10, 2025
1a77021
Fixed the rest of the errors that would only concern 1 - 3 files
Dec 10, 2025
7e6ef81
Added another ignore related to df for DataFrame names
Dec 10, 2025
85a1889
Modified CONTRIBUTING.md such that black / isort are replaced by ruff
Dec 10, 2025
1607485
Added UP045 to ignore list such that Optional[...] does not raise
Dec 12, 2025
e675fd5
Moved .flake8 configs to pyproject.toml
Dec 12, 2025
5797275
Moved autoflake to be used with ruff
Dec 12, 2025
24d6497
Moved all .flake8 settings to pyproject.toml to be compatible with ru…
Dec 12, 2025
16468a4
Removed flake8 from .pre-commit hooks
Dec 12, 2025
398e949
Applied ruff format (again); formatted some files as the line-length …
Dec 12, 2025
05ed475
Set max-complexity to 30 (as was originally) in the pyproject.toml as…
Dec 12, 2025
aa04979
Adding PD901 to ignore list such that pre-commit hooks run fully again
Dec 12, 2025
7b6da2d
Replaced dtype | None syntax by Optional[dtype] in remaining places
Dec 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .flake8

This file was deleted.

43 changes: 11 additions & 32 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
fail_fast: true
repos:
- repo: local
hooks:
- id: black
name: Black
entry: uv run --no-sync black docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: isort
name: isort
entry: uv run --no-sync isort docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: autoflake
name: autoflake
entry: uv run --no-sync autoflake docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
hooks:
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- repo: local
hooks:
- id: mypy
Expand All @@ -32,14 +19,6 @@ repos:
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: flake8
name: Flake8
entry: uv run --no-sync flake8 docling_core
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: pytest
Expand Down
6 changes: 1 addition & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>

We use the following tools to enforce code style:

- isort, to sort imports
- Black, to format code
- Ruff, to format and lint code
- Flake8, to lint code
- autoflake, to remove unused variables and imports
- [MyPy](https://mypy.readthedocs.io), as static type checker
Expand All @@ -65,9 +64,6 @@ To run the checks on-demand, type:
uv run pre-commit run --all-files
```

Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again.


### Documentation

We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects.
Expand Down
51 changes: 12 additions & 39 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def get_special_tokens(

if include_location_tokens:
# Adding dynamically generated location-tokens
for i in range(0, max(page_dimension[0], page_dimension[1])):
for i in range(max(page_dimension[0], page_dimension[1])):
special_tokens.append(f"<{IDocTagsToken._LOC_PREFIX.value}{i}/>")

return special_tokens
Expand Down Expand Up @@ -294,11 +294,7 @@ def serialize(
# as siblings at the same level (not wrapped in <list_item>).
for subref in child.children:
sub = subref.resolve(doc)
if (
isinstance(sub, ListGroup)
and sub.self_ref not in my_visited
and sub.self_ref not in excluded
):
if isinstance(sub, ListGroup) and sub.self_ref not in my_visited and sub.self_ref not in excluded:
my_visited.add(sub.self_ref)
sub_res = doc_serializer.serialize(
item=sub,
Expand Down Expand Up @@ -343,15 +339,9 @@ def serialize(
texts = (
[
tmp
for key in (
list(item.meta.__class__.model_fields)
+ list(item.meta.get_custom_part())
)
for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part()))
if (
(
params.allowed_meta_names is None
or key in params.allowed_meta_names
)
(params.allowed_meta_names is None or key in params.allowed_meta_names)
and (key not in params.blocked_meta_names)
and (tmp := self._serialize_meta_field(item.meta, key))
)
Expand All @@ -369,28 +359,16 @@ def serialize(

def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
if (field_val := getattr(meta, name)) is not None:
if name == MetaFieldName.SUMMARY and isinstance(
field_val, SummaryMetaField
):
if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
txt = f"<summary>{field_val.text}</summary>"
elif name == MetaFieldName.DESCRIPTION and isinstance(
field_val, DescriptionMetaField
):
elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField):
txt = f"<description>{field_val.text}</description>"
elif name == MetaFieldName.CLASSIFICATION and isinstance(
field_val, PictureClassificationMetaField
):
class_name = self._humanize_text(
field_val.get_main_prediction().class_name
)
elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField):
class_name = self._humanize_text(field_val.get_main_prediction().class_name)
txt = f"<classification>{class_name}</classification>"
elif name == MetaFieldName.MOLECULE and isinstance(
field_val, MoleculeMetaField
):
elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
txt = f"<molecule>{field_val.smi}</molecule>"
elif name == MetaFieldName.TABULAR_CHART and isinstance(
field_val, TabularChartMetaField
):
elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField):
# suppressing tabular chart serialization
return None
# elif tmp := str(field_val or ""):
Expand Down Expand Up @@ -419,7 +397,6 @@ def serialize(
is_chart = False

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

if item.meta:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
Expand Down Expand Up @@ -508,12 +485,8 @@ def serialize_doc(

text_res = tmp

if self.params.pretty_indentation and (
my_root := parseString(text_res).documentElement
):
if self.params.pretty_indentation and (my_root := parseString(text_res).documentElement):
text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
text_res = "\n".join(
[line for line in text_res.split("\n") if line.strip()]
)
text_res = "\n".join([line for line in text_res.split("\n") if line.strip()])

return create_ser_result(text=text_res, span_source=parts)
17 changes: 3 additions & 14 deletions docling_core/search/json_schema_to_search_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,7 @@ def __suppress(d_: Any) -> Any:
if suppress_key in d_ and d_[suppress_key] is True:
return {}
else:
return {
k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())
}
return {k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())}
return d_

return __suppress(doc)
Expand Down Expand Up @@ -325,12 +323,7 @@ def __remove(d_: Any) -> Any:
return [v for v in (__remove(v) for v in d_)]

if isinstance(d_, dict):
return {
k: v
for k, v in (
(k, __remove(v)) for k, v in d_.items() if not regx.match(k)
)
}
return {k: v for k, v in ((k, __remove(v)) for k, v in d_.items() if not regx.match(k))}

return d_

Expand Down Expand Up @@ -393,11 +386,7 @@ def _clean(d_: Any) -> Any:
return [v for v in (_clean(v) for v in d_) if not _empty(v)]

if isinstance(d_, dict):
return {
k: v
for k, v in ((k, _clean(v)) for k, v in d_.items())
if not _empty(v)
}
return {k: v for k, v in ((k, _clean(v)) for k, v in d_.items()) if not _empty(v)}

return d_

Expand Down
8 changes: 2 additions & 6 deletions docling_core/search/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,8 @@ def version_has_schema(cls, v):
"""Validate that the docling-core library is always set in version field."""
docling_core = [item for item in v if item.name == "docling-core"]
if not docling_core:
raise ValueError(
"the version should include at least a valid docling-core package"
)
raise ValueError("the version should include at least a valid docling-core package")
elif len(docling_core) > 1:
raise ValueError(
"the version must not include more than 1 docling-core package"
)
raise ValueError("the version must not include more than 1 docling-core package")
else:
return v
4 changes: 2 additions & 2 deletions docling_core/search/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Package(BaseModel, extra="forbid"):
"""

name: StrictStr = "docling-core"
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
importlib.metadata.version("docling-core")
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = importlib.metadata.version(
"docling-core"
)

def __hash__(self):
Expand Down
9 changes: 1 addition & 8 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,7 @@ def contextualize(self, chunk: BaseChunk) -> str:
for k in meta:
if k not in chunk.meta.excluded_embed:
if isinstance(meta[k], list):
items.append(
self.delim.join(
[
d if isinstance(d, str) else json.dumps(d)
for d in meta[k]
]
)
)
items.append(self.delim.join([d if isinstance(d, str) else json.dumps(d) for d in meta[k]]))
else:
items.append(json.dumps(meta[k]))
items.append(chunk.text)
Expand Down
Loading