Skip to content

Commit

Permalink
Merge pull request #225 from openpreserve/dev/refactor-arg-parsing
Browse files Browse the repository at this point in the history
Move CLI arg parsing out of fido.py into cli_args.py
  • Loading branch information
carlwilson authored Sep 23, 2024
2 parents 54cb23f + 529b483 commit 1592856
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 132 deletions.
109 changes: 109 additions & 0 deletions fido/cli_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import argparse
from argparse import ArgumentParser, RawTextHelpFormatter
from typing import Any, Dict, List


def parse_cli_args(argv: List[str], defaults: Dict[str, Any]) -> argparse.Namespace:
"""
Parse command-line arguments.
Args:
argv (list[str]): List of command-line arguments. Could be sys.argv
defaults (dict): Dictionary of default values. Expects to find configdir, bufsize and container_bufsize.
Returns:
argparse.Namespace: Parsed command-line arguments. Reference via name as in args.v or args.recurse.
"""

parser = ArgumentParser(
description=defaults["description"],
epilog=defaults["epilog"],
fromfile_prefix_chars="@",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("-v", default=False, action="store_true", help="show version information")
parser.add_argument("-q", default=False, action="store_true", help="run (more) quietly")
parser.add_argument("-recurse", default=False, action="store_true", help="recurse into subdirectories")
parser.add_argument("-zip", default=False, action="store_true", help="recurse into zip and tar files")
parser.add_argument(
"-noextension",
default=False,
action="store_true",
help="disable extension matching, reduces number of matches but may reduce false positives",
)
parser.add_argument(
"-nocontainer",
default=False,
action="store_true",
help="disable deep scan of container documents, increases speed but may reduce accuracy with big files",
)
parser.add_argument(
"-pronom_only",
default=False,
action="store_true",
help="disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results",
)

group = parser.add_mutually_exclusive_group()
group.add_argument(
"-input", default=False, help="file containing a list of files to check, one per line. - means stdin"
)
group.add_argument(
"files",
nargs="*",
default=[],
metavar="FILE",
help="files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.",
)

parser.add_argument("-filename", default=None, help="filename if file contents passed through STDIN")
parser.add_argument(
"-useformats",
metavar="INCLUDEPUIDS",
default=None,
help="comma separated string of formats to use in identification",
)
parser.add_argument(
"-nouseformats",
metavar="EXCLUDEPUIDS",
default=None,
help="comma separated string of formats not to use in identification",
)
parser.add_argument(
"-matchprintf",
metavar="FORMATSTRING",
default=None,
help="format string (Python style) to use on match. See nomatchprintf, README.txt.",
)
parser.add_argument(
"-nomatchprintf",
metavar="FORMATSTRING",
default=None,
help="format string (Python style) to use if no match. See README.txt",
)
parser.add_argument(
"-bufsize",
type=int,
default=None,
help=f"size (in bytes) of the buffer to match against (default={defaults['bufsize']})",
)
parser.add_argument(
"-sigs",
default=None,
metavar="SIG_ACT",
help='SIG_ACT "check" for new version\nSIG_ACT "update" to latest\nSIG_ACT "list" available versions\nSIG_ACT "n" use version n.',
)
parser.add_argument(
"-container_bufsize",
type=int,
default=None,
help=f"size (in bytes) of the buffer to match against (default={defaults['container_bufsize']}).",
)
parser.add_argument(
"-loadformats", default=None, metavar="XML1,...,XMLn", help="comma separated string of XML format files to add."
)
parser.add_argument(
"-confdir",
default=defaults["config_dir"],
help="configuration directory to load_fido_xml, for example, the format specifications from.",
)

return parser.parse_args(argv)
145 changes: 25 additions & 120 deletions fido/fido.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,20 @@
import sys
import tarfile
import tempfile
from argparse import ArgumentParser, RawTextHelpFormatter
from contextlib import closing

try:
from time import perf_counter
except ImportError:
from time import clock as perf_counter

import zipfile
from contextlib import closing
from time import perf_counter
from typing import Optional
from xml.etree import cElementTree as ET

from fido import CONFIG_DIR, __version__
from fido.char_handler import escape
from fido.cli_args import parse_cli_args
from fido.package import OlePackage, ZipPackage
from fido.versions import get_local_versions, sig_file_actions

defaults = {
"config_dir": CONFIG_DIR,
"bufsize": 128 * 1024, # (bytes)
"regexcachesize": 2084, # (bytes)
"printmatch": 'OK,%(info.time)s,%(info.puid)s,"%(info.formatname)s","%(info.signaturename)s",%(info.filesize)s,"%(info.filename)s","%(info.mimetype)s","%(info.matchtype)s"\n',
Expand Down Expand Up @@ -74,8 +71,8 @@ class Fido:

def __init__(
self,
quiet=False,
bufsize=None,
quiet: bool = False,
bufsize: Optional[int] = None,
container_bufsize=None,
printnomatch=None,
printmatch=None,
Expand Down Expand Up @@ -637,7 +634,12 @@ def walk_zip(self, filename, fileobj=None, extension=True):
with zipstream.open(item) as source:
self.copy_stream(source, target)
# target.seek(0)
self.identify_contents(item_name, target, self.container_type(matches), extension=extension)
self.identify_contents(
item_name,
target,
self.container_type(matches),
extension=extension,
)
except IOError:
sys.stderr.write("FIDO: ZipError {0}\n".format(filename))
except zipfile.BadZipfile:
Expand Down Expand Up @@ -666,7 +668,12 @@ def walk_tar(self, filename, fileobj, extension=True):
self.handle_matches(tar_item_name, matches, timer.duration())
if self.container_type(matches):
f.seek(0)
self.identify_contents(tar_item_name, f, self.container_type(matches), extension=extension)
self.identify_contents(
tar_item_name,
f,
self.container_type(matches),
extension=extension,
)
except tarfile.TarError:
sys.stderr.write("FIDO: Error: TarError {0}\n".format(filename))

Expand Down Expand Up @@ -793,107 +800,7 @@ def main(args=None):
"""Main FIDO method."""
if not args:
args = sys.argv[1:]

parser = ArgumentParser(
description=defaults["description"],
epilog=defaults["epilog"],
fromfile_prefix_chars="@",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("-v", default=False, action="store_true", help="show version information")
parser.add_argument("-q", default=False, action="store_true", help="run (more) quietly")
parser.add_argument("-recurse", default=False, action="store_true", help="recurse into subdirectories")
parser.add_argument("-zip", default=False, action="store_true", help="recurse into zip and tar files")
parser.add_argument(
"-noextension",
default=False,
action="store_true",
help="disable extension matching, reduces number of matches but may reduce false positives",
)
parser.add_argument(
"-nocontainer",
default=False,
action="store_true",
help="disable deep scan of container documents, increases speed but may reduce accuracy with big files",
)
parser.add_argument(
"-pronom_only",
default=False,
action="store_true",
help="disables loading of format extensions file, only PRONOM signatures are loaded, may reduce accuracy of results",
)

group = parser.add_mutually_exclusive_group()
group.add_argument(
"-input", default=False, help="file containing a list of files to check, one per line. - means stdin"
)
group.add_argument(
"files",
nargs="*",
default=[],
metavar="FILE",
help="files to check. If the file is -, then read content from stdin. In this case, python must be invoked with -u or it may convert the line terminators.",
)

parser.add_argument("-filename", default=None, help="filename if file contents passed through STDIN")
parser.add_argument(
"-useformats",
metavar="INCLUDEPUIDS",
default=None,
help="comma separated string of formats to use in identification",
)
parser.add_argument(
"-nouseformats",
metavar="EXCLUDEPUIDS",
default=None,
help="comma separated string of formats not to use in identification",
)
parser.add_argument(
"-matchprintf",
metavar="FORMATSTRING",
default=None,
help="format string (Python style) to use on match. See nomatchprintf, README.txt.",
)
parser.add_argument(
"-nomatchprintf",
metavar="FORMATSTRING",
default=None,
help="format string (Python style) to use if no match. See README.txt",
)
parser.add_argument(
"-bufsize",
type=int,
default=None,
help="size (in bytes) of the buffer to match against (default=" + str(defaults["bufsize"]) + " bytes)",
)
parser.add_argument(
"-sigs",
default=None,
metavar="SIG_ACT",
help='SIG_ACT "check" for new version\nSIG_ACT "update" to latest\nSIG_ACT "list" available versions\nSIG_ACT "n" use version n.',
)
parser.add_argument(
"-container_bufsize",
type=int,
default=None,
help="size (in bytes) of the buffer to match against (default="
+ str(defaults["container_bufsize"])
+ " bytes)",
)
parser.add_argument(
"-loadformats", default=None, metavar="XML1,...,XMLn", help="comma separated string of XML format files to add."
)
parser.add_argument(
"-confdir",
default=CONFIG_DIR,
help="configuration directory to load_fido_xml, for example, the format specifications from.",
)

if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args(args)

args = parse_cli_args(args, defaults)
timer = PerfTimer()

versions = get_local_versions(args.confdir)
Expand All @@ -904,15 +811,13 @@ def main(args=None):
defaults["format_files"] = [defaults["xml_pronomSignature"]]

if args.pronom_only:
versionHeader = "FIDO v{0} ({1}, {2})\n".format(
__version__, defaults["xml_pronomSignature"], defaults["containersignature_file"]
versionHeader = (
f"FIDO v{__version__} ({defaults['xml_pronomSignature']}, {defaults['containersignature_file']})\n"
)
else:
versionHeader = "FIDO v{0} ({1}, {2}, {3})\n".format(
__version__,
defaults["xml_pronomSignature"],
defaults["containersignature_file"],
defaults["xml_fidoExtensionSignature"],
versionHeader = (
f"FIDO v{__version__} ({defaults['xml_pronomSignature']}, {defaults['containersignature_file']}, "
f"{defaults['xml_fidoExtensionSignature']})\n"
)
defaults["format_files"].append(defaults["xml_fidoExtensionSignature"])

Expand Down
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
[build-system]
requires = ["setuptools>=42", "wheel", "setuptools-git-versioning>=2.0,<3"]
requires = ["setuptools>=42", "wheel", "twine", "setuptools-git-versioning>=2.0,<3"]
build-backend = "setuptools.build_meta"

# These were in requirements/packing.txt
# twine>=1.8,<1.9
# wheel==0.38.1

[project]
name = "opf-fido"
dynamic = ["version"]
Expand Down
50 changes: 50 additions & 0 deletions tests/test_cli_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pytest

from fido.cli_args import parse_cli_args
from fido.fido import defaults

# Common argument string


def test_parse_args_input_valid():
arg_string = (
"-v -q -recurse -zip -noextension -nocontainer -pronom_only "
"-input files.txt "
"-useformats=fmt1,fmt2 -nouseformats=fmt3,fmt4"
)
args = parse_cli_args(arg_string.split(), defaults)
assert args.v
assert args.q
assert args.recurse
assert args.zip
assert args.noextension
assert args.nocontainer
assert args.pronom_only
assert args.input == "files.txt"
assert args.useformats == "fmt1,fmt2"
assert args.nouseformats == "fmt3,fmt4"


def test_parse_args_files_valid():
arg_string = "-q -zip file1.ext file2.ext"
args = parse_cli_args(arg_string.split(), defaults)
assert args.q
assert args.zip
assert not args.noextension
assert not args.nocontainer
assert not args.pronom_only
assert args.files == ["file1.ext", "file2.ext"]
assert args.useformats is None
assert args.nouseformats is None


def test_parse_args_invalid():
arg_string = "-q -zip -bad_arg file1.ext file2.ext"
with pytest.raises(SystemExit):
parse_cli_args(arg_string.split(), defaults)


def test_parse_files_and_input_invalid():
arg_string = "-q -zip -input files.txt file1.ext file2.ext"
with pytest.raises(SystemExit):
parse_cli_args(arg_string.split(), defaults)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
16 changes: 9 additions & 7 deletions tests/test_package.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os

from fido.package import ZipPackage
import pytest

from fido.package import ZipPackage

FIXTURES_DIR = os.path.normpath(os.path.join(__file__, '..', 'fixtures'))
TEST_DATA_BAD_PACKAGES = os.path.normpath(os.path.join(__file__, "..", "test_data/hard_packages"))


def test_bad_zips():
for filename in ('bad.zip', 'worse.zip', 'unicode.zip'):
p = ZipPackage(os.path.join(FIXTURES_DIR, filename), {})
r = p.detect_formats()
assert isinstance(r, list) and len(r) == 0
# None of these files should be identified as packages?
@pytest.mark.parametrize("filename", ["bad.zip", "worse.zip", "unicode.zip", "foo.zip", "foo.tar"])
def test_bad_zip(filename):
p = ZipPackage(os.path.join(TEST_DATA_BAD_PACKAGES, filename), {})
r = p.detect_formats()
assert isinstance(r, list) and len(r) == 0

0 comments on commit 1592856

Please sign in to comment.