
Commit d184e47

Add various functions to onboard_project (#846)
* Added various args to onboard_project
  - Added zip file support with --zip-password arg
  - Added --clean-project arg to run clean_project before extraction
  - Added --timestamp arg to append a timestamp to the project name
  - Added --wildebeest arg to run a Wildebeest analysis on the extracted project
* Remove --zip-password, switch to password prompt
* Rename variables and methods
1 parent 2c2a2c1 commit d184e47
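As a usage sketch (assuming the script is run as a module and that the flag spellings match the argparse definitions in this commit; the archive and config file names are placeholders), the new options might be combined like:

    python -m silnlp.common.onboard_project MYPROJECT.zip --clean-project --timestamp --extract_corpora --wildebeest --config onboard_config.yml

Given a .zip argument the script prompts for a password only if the archive is encrypted, extracts it to a temporary folder, and treats that folder as the copy source; --timestamp appends the current date to the project folder name.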

File tree

4 files changed: +175 -64 lines changed

poetry.lock

Lines changed: 42 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -100,6 +100,7 @@ openpyxl = "^3.1.2"
 packaging = "^22.0"
 hanzidentifier = "^1.2.0"
 jarowinkler = "^2.0.1"
+wildebeest-nlp = ">=0.9.0"
 
 [tool.poetry.group.dev.dependencies]
 types-pyyaml = "^6.0.12.12"
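The new wildebeest-nlp dependency backs the --wildebeest option added to onboard_project.py below. A minimal standalone sketch of the same call the onboarding script makes (input and output file names are placeholders):

    import wildebeest.wb_analysis as wb_ana
    from pathlib import Path

    # Placeholder corpus extract; onboard_project.py globs this from SIL_NLP_ENV.mt_scripture_dir.
    extract_file = Path("MYPROJECT.txt")
    with (
        open("MYPROJECT_wildebeest.json", "w", encoding="utf-8") as json_f,
        open("MYPROJECT_wildebeest.txt", "w", encoding="utf-8") as txt_f,
    ):
        # Writes a JSON report and a pretty-printed text report of the analysis.
        wb_ana.process(in_file=extract_file, json_output=json_f, pp_output=txt_f)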

scripts/clean_projects.py

Lines changed: 17 additions & 10 deletions
@@ -3,16 +3,19 @@
 import shutil
 from datetime import datetime
 from pathlib import Path
+
 from tqdm import tqdm
 
+
 def parse_arguments():
     parser = argparse.ArgumentParser(description="Clean up unnecessary files and folders.")
-    parser.add_argument("--input", type=Path, default=Path("S:/Paratext/projects"), help="Folder to search.")
+    parser.add_argument("--input", type=Path, default=Path("/root/M/Paratext/projects"), help="Folder to search.")
     parser.add_argument("--delete-subfolders", action="store_true", help="Delete subfolders inside projects.")
     parser.add_argument("--confirm-delete", action="store_true", help="Skip confirmation and delete directly.")
     parser.add_argument("--dry-run", action="store_true", help="Generate a CSV report without deleting.")
     return parser.parse_args()
 
+
 def should_delete(path: Path):
     patterns = [
         "Notes",
@@ -26,6 +29,7 @@ def should_delete(path: Path):
     ]
     return any(pattern in path.name for pattern in patterns)
 
+
 def find_items_to_delete(root_path: Path, delete_subfolders: bool):
     files_to_delete = []
     folders_to_delete = []
@@ -36,19 +40,20 @@ def find_items_to_delete(root_path: Path, delete_subfolders: bool):
             print(f" Warning: Ignoring symlink found: {project_folder}")
             continue
         if project_folder.is_dir():
-            for path in project_folder.glob('*'):
+            for path in project_folder.glob("*"):
                 if path.is_file() and should_delete(path):
                     files_to_delete.append(path)
                 if path.is_dir() and delete_subfolders:
                     folders_to_delete.append(path)
 
     return files_to_delete, folders_to_delete
 
-def execute_and_report(args):
+
+def clean_projects(args):
     now = datetime.now()
-    now_filestamp = now.strftime('%Y%m%d_%H%M%S')
-    now_csv_date = now.strftime('%Y %m %d')
-    now_csv_time = now.strftime('%H:%M:%S')
+    now_filestamp = now.strftime("%Y%m%d_%H%M%S")
+    now_csv_date = now.strftime("%Y %m %d")
+    now_csv_time = now.strftime("%H:%M:%S")
 
     # Find files/folders to delete, with subfolder handling based on the option
     files_to_delete, folders_to_delete = find_items_to_delete(args.input, args.delete_subfolders)
@@ -64,7 +69,7 @@ def execute_and_report(args):
         csv_writer.writerow(["Path", "Type", "Size (bytes)", "Deleted"])
 
         for folder_to_delete in folders_to_delete:
-            size = sum(f.stat().st_size for f in folder_to_delete.glob('*') if f.is_file())
+            size = sum(f.stat().st_size for f in folder_to_delete.glob("*") if f.is_file())
             total_size += size
 
             deleted = "No" if args.dry_run else try_delete(folder_to_delete, args)
@@ -88,23 +93,25 @@ def delete_item(item):
         shutil.rmtree(item)
     return
 
+
 def try_delete(item: Path, args) -> str:
 
     if args.confirm_delete:
         delete_item(item)
         return "Yes"
     else:
         confirmation = input(f"Delete {item}? (y/n): ").strip().lower()
-        if confirmation == 'y':
+        if confirmation == "y":
             delete_item(item)
             return "Yes"
         else:
             return "Skipped"
 
 
 def main():
-    args = parse_arguments()
-    execute_and_report(args)
+    args = parse_arguments()
+    clean_projects(args)
+
 
 if __name__ == "__main__":
     main()
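Renaming execute_and_report to clean_projects lets onboard_project.py (below) import and call the cleanup step directly. A minimal sketch of that programmatic call, passing the same fields parse_arguments defines (the project path is a placeholder):

    from argparse import Namespace
    from pathlib import Path

    from scripts.clean_projects import clean_projects

    clean_projects(
        Namespace(
            input=Path("/root/M/Paratext/projects/MYPROJECT"),  # placeholder project folder
            delete_subfolders=True,   # also delete subfolders inside the project
            confirm_delete=True,      # delete without prompting for each item
            dry_run=False,            # actually delete instead of only writing the CSV report
        )
    )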

silnlp/common/onboard_project.py

Lines changed: 115 additions & 53 deletions
@@ -1,9 +1,16 @@
 import argparse
+import getpass
 import logging
+import tempfile
+import zipfile
+from datetime import datetime
 from pathlib import Path
 
+import wildebeest.wb_analysis as wb_ana
 import yaml
 
+from scripts.clean_projects import clean_projects
+
 from .collect_verse_counts import collect_verse_counts
 from .environment import SIL_NLP_ENV
 from .extract_corpora import extract_corpora
@@ -58,6 +65,50 @@ def copy_paratext_project_folder(source_dir: Path, project_name: str, overwrite=
         _copy_file_to_paratext_project(source_item, target_item, overwrite=overwrite)
 
 
+def collect_verse_counts_wrapper(project_name: str, verse_counts_config: dict) -> None:
+
+    output_folder = Path(
+        verse_counts_config.get("output_folder", SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name)
+    )
+    if not output_folder.exists():
+        output_folder.mkdir(parents=True, exist_ok=True)
+
+    input_folder = verse_counts_config.get("input_folder", SIL_NLP_ENV.mt_scripture_dir)
+
+    file_patterns = verse_counts_config.get("files", f"*{project_name}*.txt")
+
+    input_folder_path = Path(input_folder)
+    if not input_folder_path.exists():
+        LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.")
+        return
+
+    matched_files = list(input_folder_path.glob(file_patterns))
+    if not matched_files:
+        LOGGER.error(
+            f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection."
+        )
+        return
+
+    collect_verse_counts(
+        input_folder=input_folder_path,
+        output_folder=output_folder,
+        file_patterns=file_patterns,
+        deutero=verse_counts_config.get("deutero", False),
+        recount=verse_counts_config.get("recount", False),
+    )
+
+
+def get_config(config_path: str) -> dict:
+    if config_path:
+        config_file = Path(config_path)
+        if not config_file.exists():
+            raise FileNotFoundError(f"Config file '{config_file}' does not exist.")
+        with config_file.open("r", encoding="utf-8") as file:
+            return yaml.safe_load(file)
+    else:
+        return {}
+
+
 def main() -> None:
     parser = argparse.ArgumentParser(
         description="Performs several steps to onboard a new project before training a model.",
@@ -97,84 +148,95 @@ def main() -> None:
         action="store_true",
         help="Collect various counts from the extracted Paratext project.",
     )
+    parser.add_argument(
+        "--clean-project",
+        default=False,
+        action="store_true",
+        help="Cleans the Paratext project folder by removing unnecessary files and folders.",
+    )
+    parser.add_argument(
+        "--timestamp",
+        default=False,
+        action="store_true",
+        help="Add a timestamp to the project folder name when creating a new Paratext project folder.",
+    )
+    parser.add_argument(
+        "--wildebeest", default=False, action="store_true", help="Run Wildebeest analysis on the extracted corpora."
+    )
 
     args = parser.parse_args()
     if not args.project:
         raise ValueError("Project name is required. Please provide a valid Paratext project name using <project>.")
 
+    config = get_config(args.config) if args.config else {}
+
+    if args.project.endswith(".zip"):
+        with zipfile.ZipFile(args.project, "r") as zip_ref:
+            # Check if any file in the zip is encrypted
+            temp_dir = tempfile.TemporaryDirectory()
+            needs_password = any(zinfo.flag_bits & 0x1 for zinfo in zip_ref.infolist())
+            if needs_password:
+                pwd = getpass.getpass(prompt=f"Enter password for zip file '{args.project}': ")
+                zip_ref.extractall(temp_dir.name, pwd=pwd.encode())
+            else:
+                zip_ref.extractall(temp_dir.name)
+        args.copy_from = temp_dir.name
+        args.project = Path(args.project).stem
+
     project_name = args.project
+    if args.timestamp:
+
+        now = datetime.now()
+        timestamp = now.strftime("%Y_%m_%d")
+        project_name = f"{args.project}_{timestamp}"
+        LOGGER.info(f"Timestamping project. New project name: {project_name}")
 
     if args.copy_from:
         LOGGER.info(f"Onboarding project: {args.project}")
         paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name)
         copy_paratext_project_folder(Path(args.copy_from), paratext_project_dir, overwrite=args.overwrite)
 
-    if args.config:
-        config_file = Path(args.config)
-        if not config_file.exists():
-            raise FileNotFoundError(f"Config file '{config_file}' does not exist.")
-        with config_file.open("r", encoding="utf-8") as file:
-            config = yaml.safe_load(file)
-    else:
-        raise ValueError("Config file is required. Please provide a valid configuration file using --config.")
+    if args.clean_project:
+        LOGGER.info(f"Cleaning Paratext project folder for {project_name}.")
+        clean_projects(
+            argparse.Namespace(
+                input=get_paratext_project_dir(project_name),
+                delete_subfolders=True,
+                confirm_delete=True,
+                dry_run=False,
+            )
+        )
 
     if args.extract_corpora:
-        LOGGER.info(f"Extracting {project_name}.")
+        extract_config = config.get("extract_corpora", {})
         extract_corpora(
             projects={project_name},
-            books_to_include=config["extract_corpora"]["include"] if "include" in config["extract_corpora"] else [],
-            books_to_exclude=config["extract_corpora"]["exclude"] if "exclude" in config["extract_corpora"] else [],
-            include_markers=(config["extract_corpora"]["markers"] if "markers" in config["extract_corpora"] else False),
-            extract_lemmas=config["extract_corpora"]["lemmas"] if "lemmas" in config["extract_corpora"] else False,
-            extract_project_vrefs=(
-                config["extract_corpora"]["project-vrefs"] if "project-vrefs" in config["extract_corpora"] else False
-            ),
+            books_to_include=extract_config.get("include", []),
+            books_to_exclude=extract_config.get("exclude", []),
+            include_markers=extract_config.get("markers", False),
+            extract_lemmas=extract_config.get("lemmas", False),
+            extract_project_vrefs=extract_config.get("project-vrefs", False),
         )
 
     if args.collect_verse_counts:
         if not args.extract_corpora:
             LOGGER.warning(
                 "--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first."
             )
-
         LOGGER.info(f"Collecting verse counts from {project_name}.")
+        collect_verse_counts_wrapper(project_name, config.get("verse_counts", {}))
 
-        if config["verse_counts"]["output_folder"]:
-            output_folder = Path(config["verse_counts"]["output_folder"])
-            if not output_folder.exists():
-                output_folder.mkdir(parents=True, exist_ok=True)
-        else:
-            output_folder = SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name
-            if not output_folder.exists():
-                output_folder.mkdir(parents=True, exist_ok=True)
-        input_folder = (
-            config["verse_counts"]["input_folder"]
-            if "input_folder" in config["verse_counts"]
-            else SIL_NLP_ENV.mt_scripture_dir
-        )
-        file_patterns = (
-            config["verse_counts"]["files"] if "files" in config["verse_counts"] else f"*{project_name}*.txt"
-        )
-
-        input_folder_path = Path(input_folder)
-        if not input_folder_path.exists():
-            LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.")
-            return
-
-        matched_files = list(input_folder_path.glob(file_patterns))
-        if not matched_files:
-            LOGGER.error(
-                f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection."
-            )
-            return
-
-        collect_verse_counts(
-            input_folder=input_folder_path,
-            output_folder=output_folder,
-            file_patterns=file_patterns,
-            deutero=config["verse_counts"]["deutero"] if "deutero" in config["verse_counts"] else False,
-            recount=config["verse_counts"]["recount"] if "recount" in config["verse_counts"] else False,
-        )
+    if args.wildebeest:
+        if not args.extract_corpora:
+            LOGGER.warning("--extract_corpora was not included. Wildebeest requires the corpus to be extracted first.")
+
+        extract_file = list(SIL_NLP_ENV.mt_scripture_dir.glob(f"*{project_name}.txt"))[0]
+        LOGGER.info(f"Running Wildebeest analysis on {extract_file}.")
+        with (
+            open(f"{project_name}_wildebeest.json", "w", encoding="utf-8") as json_f,
+            open(f"{project_name}_wildebeest.txt", "w", encoding="utf-8") as txt_f,
+        ):
            wb_ana.process(in_file=extract_file, json_output=json_f, pp_output=txt_f)
 
 
 if __name__ == "__main__":
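With these changes every key in the --config file is optional: the extract_corpora and verse_counts settings are read with .get() and fall back to defaults. A sketch of the shape the config is expected to take, shown as the dict yaml.safe_load would return and covering only the keys this script reads (the values shown are the defaults used when a key is absent):

    config = {
        "extract_corpora": {
            "include": [],             # books_to_include
            "exclude": [],             # books_to_exclude
            "markers": False,          # include_markers
            "lemmas": False,           # extract_lemmas
            "project-vrefs": False,    # extract_project_vrefs
        },
        "verse_counts": {
            # "output_folder" defaults to SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / <project>
            # "input_folder" defaults to SIL_NLP_ENV.mt_scripture_dir
            # "files" defaults to "*<project>*.txt"
            "deutero": False,
            "recount": False,
        },
    }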
