Skip to content

Commit

Permalink
Run ruff format and check
Browse files: browse the repository at this point in the history
  • Loading branch information
ghodsizadeh committed Jan 5, 2025
1 parent ac7ad75 commit f1f92aa
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 32 deletions.
27 changes: 20 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: local
hooks:
- id: ruff-format
name: ruff-format
entry: uv run ruff format
language: python
types_or: [ python, pyi ]
- id: ruff-lint
name: ruff-lint
entry: uv run ruff check
language: python
types_or: [ python, pyi ]
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@ for df in dfs:

## TODO:
- [ ] Convert datatype to numeric
- [ ]
- [ ]
35 changes: 21 additions & 14 deletions pdf2csv/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,24 @@
def convert_cli(
pdf_path: str = typer.Argument(..., help="Path to the input PDF file."),
output_dir: Optional[str] = typer.Option(
'.', "--output-dir", "-o", help="Directory to save output files."
".", "--output-dir", "-o", help="Directory to save output files."
),
rtl: bool = typer.Option(
False,
False,
"--rtl/--no-rtl",
help="Whether to reverse text for right-to-left format (default=False). "
"Use '--rtl' to reverse the text."
"Use '--rtl' to reverse the text.",
),
output_format: Literal['csv', 'xlsx'] = typer.Option( # Update output_format parameter
'csv',
"--output-format",
"-f",
help="Format to save the output files. Options are 'csv' and 'xlsx'. Defaults to 'csv'."
output_format: Literal[
"csv", "xlsx"
] = typer.Option( # Update output_format parameter
"csv",
"--output-format",
"-f",
help="Format to save the output files. Options are 'csv' and 'xlsx'. Defaults to 'csv'.",
),
verbose: bool = typer.Option(
False,
"--verbose",
"-v",
help="Enable verbose (DEBUG) logging."
False, "--verbose", "-v", help="Enable verbose (DEBUG) logging."
),
):
"""
Expand All @@ -41,10 +40,18 @@ def convert_cli(
else:
logging.basicConfig(level=logging.INFO)

logging.info(f"Starting conversion for {pdf_path}, rtl={rtl}, output_format={output_format} ...")
logging.info(
f"Starting conversion for {pdf_path}, rtl={rtl}, output_format={output_format} ..."
)

try:
dfs = convert(pdf_path, output_dir=output_dir, rtl=rtl, output_format=output_format, index=False)
dfs = convert(
pdf_path,
output_dir=output_dir,
rtl=rtl,
output_format=output_format,
index=False,
)
logging.info(f"Extracted {len(dfs)} table(s) from {pdf_path}.")
except FileNotFoundError as fnf_err:
logging.error(str(fnf_err))
Expand Down
14 changes: 9 additions & 5 deletions pdf2csv/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def convert(
pdf_path: str,
output_dir: Optional[str] = None,
rtl: bool = False,
output_format: Literal['csv', 'xlsx'] = 'csv', # Use Literal for type checking
output_format: Literal["csv", "xlsx"] = "csv", # Use Literal for type checking
**kwargs: Any,
) -> List[pd.DataFrame]:
"""
Expand Down Expand Up @@ -110,12 +110,16 @@ def convert(

# Optionally save to CSV or XLSX
if output_dir_path is not None:
if output_format == 'csv':
csv_filename = output_dir_path / f"{doc_filename}-table-{table_idx}.csv"
if output_format == "csv":
csv_filename = (
output_dir_path / f"{doc_filename}-table-{table_idx}.csv"
)
df.to_csv(csv_filename, **kwargs)
_log.info(f"Saved CSV table #{table_idx} to: {csv_filename}")
elif output_format == 'xlsx':
xlsx_filename = output_dir_path / f"{doc_filename}-table-{table_idx}.xlsx"
elif output_format == "xlsx":
xlsx_filename = (
output_dir_path / f"{doc_filename}-table-{table_idx}.xlsx"
)
df.to_excel(xlsx_filename, **kwargs)
_log.info(f"Saved XLSX table #{table_idx} to: {xlsx_filename}")
else:
Expand Down
5 changes: 3 additions & 2 deletions pdf2csv/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

def ensure_numeric_columns(df: pd.DataFrame, errors: str = 'coerce') -> pd.DataFrame:

def ensure_numeric_columns(df: pd.DataFrame, errors: str = "coerce") -> pd.DataFrame:
"""
Ensure all columns which are numbers are considered as numeric.
Expand All @@ -18,5 +19,5 @@ def ensure_numeric_columns(df: pd.DataFrame, errors: str = 'coerce') -> pd.DataF
The processed DataFrame with numeric columns converted.
"""
for col in df.columns:
df[col] = pd.to_numeric(df[col], errors='ignore')
df[col] = pd.to_numeric(df[col], errors="ignore")
return df
5 changes: 2 additions & 3 deletions tests/test_convertor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pdf2csv import convert
import pytest


@pytest.fixture
def rtl_pdf():
return "tests/assets/rtl_test.pdf"
Expand All @@ -11,9 +12,7 @@ def test_rtl_convert(rtl_pdf):
dfs = convert(pdf_path, rtl=True)
assert len(dfs) == 1
df = dfs[0]
df.columns[0] == 'بلندمدت ميانگين.اختالف نسبت به درصد'
df.columns[0] == "بلندمدت ميانگين.اختالف نسبت به درصد"
assert len(df.columns) == 6
assert len(df) == 10
assert df.iloc[0, 0] == -44


0 comments on commit f1f92aa

Please sign in to comment.