Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e67c125
Added ruff to dev dependencies
Dec 10, 2025
06cbc7b
Added ruff settings to pyproject.toml as in docling
Dec 10, 2025
3035504
Cleanup of pyproject.toml
Dec 10, 2025
e4fac05
Copied settings for ruff pre-commit hooks from docling
Dec 10, 2025
38827c5
Excluded test/data/** from ruff formatting / linting
Dec 10, 2025
ffbe537
ruff format
Dec 10, 2025
81b3f8b
Added some ignore statements to pyproject.toml such that ruff check r…
Dec 10, 2025
260478f
ruff check --fix
Dec 10, 2025
8db072c
Ignored some more rules
Dec 10, 2025
1a77021
Fixed the rest of the errors that would only concern 1 - 3 files
Dec 10, 2025
7e6ef81
Added another ignore related to df for DataFrame names
Dec 10, 2025
85a1889
Modified CONTRIBUTING.md such that black / isort are replaced by ruff
Dec 10, 2025
1607485
Added UP045 to ignore list such that Optional[...] does not raise
Dec 12, 2025
e675fd5
Moved .flake8 configs to pyproject.toml
Dec 12, 2025
5797275
Moved autoflake to be used with ruff
Dec 12, 2025
24d6497
Moved all .flake8 settings to pyproject.toml to be compatible with ru…
Dec 12, 2025
16468a4
Removed flake8 from .pre-commit hooks
Dec 12, 2025
398e949
Applied ruff format (again); formatted some files as the line-length …
Dec 12, 2025
05ed475
Set max-complexity to 30 (as was originally) in the pyproject.toml as…
Dec 12, 2025
aa04979
Adding PD901 to ignore list such that pre-commit hooks run fully again
Dec 12, 2025
7b6da2d
Replaced dtype | None syntax by Optional[dtype] in remaining places
Dec 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .flake8

This file was deleted.

43 changes: 11 additions & 32 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,16 @@
fail_fast: true
repos:
- repo: local
hooks:
- id: black
name: Black
entry: uv run --no-sync black docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: isort
name: isort
entry: uv run --no-sync isort docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: autoflake
name: autoflake
entry: uv run --no-sync autoflake docling_core test
pass_filenames: false
language: system
files: '\.py$'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
hooks:
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- repo: local
hooks:
- id: mypy
Expand All @@ -32,14 +19,6 @@ repos:
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: flake8
name: Flake8
entry: uv run --no-sync flake8 docling_core
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: pytest
Expand Down
6 changes: 1 addition & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>

We use the following tools to enforce code style:

- isort, to sort imports
- Black, to format code
- Ruff, to format and lint code
- Flake8, to lint code
- autoflake, to remove unused variables and imports
- [MyPy](https://mypy.readthedocs.io), as static type checker
Expand All @@ -65,9 +64,6 @@ To run the checks on-demand, type:
uv run pre-commit run --all-files
```

Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again.


### Documentation

We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects.
Expand Down
51 changes: 12 additions & 39 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def get_special_tokens(

if include_location_tokens:
# Adding dynamically generated location-tokens
for i in range(0, max(page_dimension[0], page_dimension[1])):
for i in range(max(page_dimension[0], page_dimension[1])):
special_tokens.append(f"<{IDocTagsToken._LOC_PREFIX.value}{i}/>")

return special_tokens
Expand Down Expand Up @@ -294,11 +294,7 @@ def serialize(
# as siblings at the same level (not wrapped in <list_item>).
for subref in child.children:
sub = subref.resolve(doc)
if (
isinstance(sub, ListGroup)
and sub.self_ref not in my_visited
and sub.self_ref not in excluded
):
if isinstance(sub, ListGroup) and sub.self_ref not in my_visited and sub.self_ref not in excluded:
my_visited.add(sub.self_ref)
sub_res = doc_serializer.serialize(
item=sub,
Expand Down Expand Up @@ -343,15 +339,9 @@ def serialize(
texts = (
[
tmp
for key in (
list(item.meta.__class__.model_fields)
+ list(item.meta.get_custom_part())
)
for key in (list(item.meta.__class__.model_fields) + list(item.meta.get_custom_part()))
if (
(
params.allowed_meta_names is None
or key in params.allowed_meta_names
)
(params.allowed_meta_names is None or key in params.allowed_meta_names)
and (key not in params.blocked_meta_names)
and (tmp := self._serialize_meta_field(item.meta, key))
)
Expand All @@ -369,28 +359,16 @@ def serialize(

def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
if (field_val := getattr(meta, name)) is not None:
if name == MetaFieldName.SUMMARY and isinstance(
field_val, SummaryMetaField
):
if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
txt = f"<summary>{field_val.text}</summary>"
elif name == MetaFieldName.DESCRIPTION and isinstance(
field_val, DescriptionMetaField
):
elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField):
txt = f"<description>{field_val.text}</description>"
elif name == MetaFieldName.CLASSIFICATION and isinstance(
field_val, PictureClassificationMetaField
):
class_name = self._humanize_text(
field_val.get_main_prediction().class_name
)
elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField):
class_name = self._humanize_text(field_val.get_main_prediction().class_name)
txt = f"<classification>{class_name}</classification>"
elif name == MetaFieldName.MOLECULE and isinstance(
field_val, MoleculeMetaField
):
elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
txt = f"<molecule>{field_val.smi}</molecule>"
elif name == MetaFieldName.TABULAR_CHART and isinstance(
field_val, TabularChartMetaField
):
elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField):
# suppressing tabular chart serialization
return None
# elif tmp := str(field_val or ""):
Expand Down Expand Up @@ -419,7 +397,6 @@ def serialize(
is_chart = False

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

if item.meta:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
Expand Down Expand Up @@ -508,12 +485,8 @@ def serialize_doc(

text_res = tmp

if self.params.pretty_indentation and (
my_root := parseString(text_res).documentElement
):
if self.params.pretty_indentation and (my_root := parseString(text_res).documentElement):
text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
text_res = "\n".join(
[line for line in text_res.split("\n") if line.strip()]
)
text_res = "\n".join([line for line in text_res.split("\n") if line.strip()])

return create_ser_result(text=text_res, span_source=parts)
17 changes: 3 additions & 14 deletions docling_core/search/json_schema_to_search_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,7 @@ def __suppress(d_: Any) -> Any:
if suppress_key in d_ and d_[suppress_key] is True:
return {}
else:
return {
k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())
}
return {k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())}
return d_

return __suppress(doc)
Expand Down Expand Up @@ -325,12 +323,7 @@ def __remove(d_: Any) -> Any:
return [v for v in (__remove(v) for v in d_)]

if isinstance(d_, dict):
return {
k: v
for k, v in (
(k, __remove(v)) for k, v in d_.items() if not regx.match(k)
)
}
return {k: v for k, v in ((k, __remove(v)) for k, v in d_.items() if not regx.match(k))}

return d_

Expand Down Expand Up @@ -393,11 +386,7 @@ def _clean(d_: Any) -> Any:
return [v for v in (_clean(v) for v in d_) if not _empty(v)]

if isinstance(d_, dict):
return {
k: v
for k, v in ((k, _clean(v)) for k, v in d_.items())
if not _empty(v)
}
return {k: v for k, v in ((k, _clean(v)) for k, v in d_.items()) if not _empty(v)}

return d_

Expand Down
8 changes: 2 additions & 6 deletions docling_core/search/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,8 @@ def version_has_schema(cls, v):
"""Validate that the docling-core library is always set in version field."""
docling_core = [item for item in v if item.name == "docling-core"]
if not docling_core:
raise ValueError(
"the version should include at least a valid docling-core package"
)
raise ValueError("the version should include at least a valid docling-core package")
elif len(docling_core) > 1:
raise ValueError(
"the version must not include more than 1 docling-core package"
)
raise ValueError("the version must not include more than 1 docling-core package")
else:
return v
4 changes: 2 additions & 2 deletions docling_core/search/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Package(BaseModel, extra="forbid"):
"""

name: StrictStr = "docling-core"
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
importlib.metadata.version("docling-core")
version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = importlib.metadata.version(
"docling-core"
)

def __hash__(self):
Expand Down
9 changes: 1 addition & 8 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,7 @@ def contextualize(self, chunk: BaseChunk) -> str:
for k in meta:
if k not in chunk.meta.excluded_embed:
if isinstance(meta[k], list):
items.append(
self.delim.join(
[
d if isinstance(d, str) else json.dumps(d)
for d in meta[k]
]
)
)
items.append(self.delim.join([d if isinstance(d, str) else json.dumps(d) for d in meta[k]]))
else:
items.append(json.dumps(meta[k]))
items.append(chunk.text)
Expand Down
Loading