Skip to content

Commit 3c419e8

Browse files
Improvements to onboard_project (#848)
* Made several improvements to onboard_project:
  - Can now onboard multiple projects.
  - --copy-from now uses the user's Downloads folder if it is included with no value.
  - clean_project changed from silnlp.scripts.clean_project to silnlp.common.clean_projects, so multiple projects can be cleaned at once and more unnecessary files are removed.
  - Added a wildebeest section to the config, and default args for the Wildebeest analysis.
  - Added a zip_passwords section to the config for encrypted zip files.
1 parent d184e47 commit 3c419e8

File tree

1 file changed

+101
-73
lines changed

1 file changed

+101
-73
lines changed

silnlp/common/onboard_project.py

Lines changed: 101 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import argparse
22
import getpass
33
import logging
4+
import sys
45
import tempfile
56
import zipfile
67
from datetime import datetime
@@ -9,7 +10,7 @@
910
import wildebeest.wb_analysis as wb_ana
1011
import yaml
1112

12-
from scripts.clean_projects import clean_projects
13+
import silnlp.common.clean_projects as clean_projects
1314

1415
from .collect_verse_counts import collect_verse_counts
1516
from .environment import SIL_NLP_ENV
@@ -115,21 +116,24 @@ def main() -> None:
115116
)
116117

117118
parser.add_argument(
118-
"project",
119+
"projects",
119120
help="Paratext project name. The project will be stored on the bucket at Paratext/projects/<project>.",
120-
type=str,
121+
nargs="*",
122+
default=None,
121123
)
122124
parser.add_argument(
123125
"--copy-from",
124-
help="Path to a downloaded Paratext project folder. The local project will be copied to the bucket.",
126+
help="Path to a downloaded Paratext project folder. The local project will be copied to the bucket. If provided without a value, uses the user's Downloads directory.",
127+
nargs="?",
128+
const=Path.home() / "Downloads",
125129
default=None,
126-
type=str,
130+
type=Path,
127131
)
128132
parser.add_argument(
129133
"--config",
130134
help="Path to a configuration file in YAML format. This is used to configure the onboarding process.",
131135
default=None,
132-
type=str,
136+
type=Path,
133137
)
134138
parser.add_argument(
135139
"--overwrite", help="Overwrite any existing files and folders", default=False, action="store_true"
@@ -152,7 +156,7 @@ def main() -> None:
152156
"--clean-project",
153157
default=False,
154158
action="store_true",
155-
help="Cleans the Paratext project folder by removing unnecessary files and folders.",
159+
help="Cleans the Paratext project folder by removing unnecessary files and folders before copying. Only used if --copy-from is provided.",
156160
)
157161
parser.add_argument(
158162
"--timestamp",
@@ -165,78 +169,102 @@ def main() -> None:
165169
)
166170

167171
args = parser.parse_args()
168-
if not args.project:
172+
if not args.projects:
169173
raise ValueError("Project name is required. Please provide a valid Paratext project name using <project>.")
170174

171175
config = get_config(args.config) if args.config else {}
172176

173-
if args.project.endswith(".zip"):
174-
with zipfile.ZipFile(args.project, "r") as zip_ref:
175-
# Check if any file in the zip is encrypted
176-
temp_dir = tempfile.TemporaryDirectory()
177-
needs_password = any(zinfo.flag_bits & 0x1 for zinfo in zip_ref.infolist())
178-
if needs_password:
179-
pwd = getpass.getpass(prompt=f"Enter password for zip file '{args.project}': ")
180-
zip_ref.extractall(temp_dir.name, pwd=pwd.encode())
181-
else:
182-
zip_ref.extractall(temp_dir.name)
183-
args.copy_from = temp_dir.name
184-
args.project = Path(args.project).stem
185-
186-
project_name = args.project
187-
if args.timestamp:
188-
189-
now = datetime.now()
190-
timestamp = now.strftime("%Y_%m_%d")
191-
project_name = f"{args.project}_{timestamp}"
192-
LOGGER.info(f"Timestamping project. New project name: {project_name}")
193-
194-
if args.copy_from:
195-
LOGGER.info(f"Onboarding project: {args.project}")
196-
paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name)
197-
copy_paratext_project_folder(Path(args.copy_from), paratext_project_dir, overwrite=args.overwrite)
198-
199-
if args.clean_project:
200-
LOGGER.info(f"Cleaning Paratext project folder for {project_name}.")
201-
clean_projects(
202-
argparse.Namespace(
203-
input=get_paratext_project_dir(project_name),
204-
delete_subfolders=True,
205-
confirm_delete=True,
206-
dry_run=False,
177+
if args.clean_project and args.copy_from:
178+
LOGGER.info("Cleaning Paratext project folders.")
179+
old_argv = sys.argv
180+
try:
181+
sys.argv = ["--folders", str(args.copy_from)]
182+
clean_projects.main()
183+
finally:
184+
sys.argv = old_argv
185+
186+
for project in args.projects:
187+
if project.endswith(".zip"):
188+
with zipfile.ZipFile(project, "r") as zip_ref:
189+
# Check if any file in the zip is encrypted
190+
temp_dir = tempfile.TemporaryDirectory()
191+
needs_password = any(zinfo.flag_bits & 0x1 for zinfo in zip_ref.infolist())
192+
if needs_password:
193+
if config.get("zip_passwords"):
194+
pwd = config["zip_passwords"].get(project, None)
195+
if not pwd:
196+
pwd = getpass.getpass(prompt=f"Enter password for {project}: ")
197+
zip_ref.extractall(temp_dir.name, pwd=pwd.encode())
198+
else:
199+
zip_ref.extractall(temp_dir.name)
200+
args.copy_from = temp_dir.name
201+
project = Path(project).stem
202+
203+
project_name = project
204+
if args.timestamp:
205+
206+
now = datetime.now()
207+
timestamp = now.strftime("%Y_%m_%d")
208+
project_name = f"{project}_{timestamp}"
209+
LOGGER.info(f"Timestamping project. New project name: {project_name}")
210+
211+
if args.copy_from:
212+
LOGGER.info(f"Copying project: {project}")
213+
paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name)
214+
source_path = Path(args.copy_from)
215+
if source_path.name != project_name:
216+
source_path = Path(source_path / project_name)
217+
copy_paratext_project_folder(source_path, paratext_project_dir, overwrite=args.overwrite)
218+
219+
if args.extract_corpora:
220+
extract_config: dict = config.get("extract_corpora", {})
221+
extract_corpora(
222+
projects={project_name},
223+
books_to_include=extract_config.get("include", []),
224+
books_to_exclude=extract_config.get("exclude", []),
225+
include_markers=extract_config.get("markers", False),
226+
extract_lemmas=extract_config.get("lemmas", False),
227+
extract_project_vrefs=extract_config.get("project-vrefs", False),
207228
)
208-
)
209229

210-
if args.extract_corpora:
211-
extract_config = config.get("extract_corpora", {})
212-
extract_corpora(
213-
projects={project_name},
214-
books_to_include=extract_config.get("include", []),
215-
books_to_exclude=extract_config.get("exclude", []),
216-
include_markers=extract_config.get("markers", False),
217-
extract_lemmas=extract_config.get("lemmas", False),
218-
extract_project_vrefs=extract_config.get("project-vrefs", False),
219-
)
220-
221-
if args.collect_verse_counts:
222-
if not args.extract_corpora:
223-
LOGGER.warning(
224-
"--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first."
225-
)
226-
LOGGER.info(f"Collecting verse counts from {project_name}.")
227-
collect_verse_counts_wrapper(project_name, config.get("verse_counts", {}))
228-
229-
if args.wildebeest:
230-
if not args.extract_corpora:
231-
LOGGER.warning("--extract_corpora was not included. Wildebeest requires the corpus to be extracted first.")
232-
233-
extract_file = list(SIL_NLP_ENV.mt_scripture_dir.glob(f"*{project_name}.txt"))[0]
234-
LOGGER.info(f"Running Wildebeest analysis on {extract_file}.")
235-
with (
236-
open(f"{project_name}_wildebeest.json", "w", encoding="utf-8") as json_f,
237-
open(f"{project_name}_wildebeest.txt", "w", encoding="utf-8") as txt_f,
238-
):
239-
wb_ana.process(in_file=extract_file, json_output=json_f, pp_output=txt_f)
230+
if args.collect_verse_counts:
231+
if not args.extract_corpora:
232+
LOGGER.warning(
233+
"--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first."
234+
)
235+
LOGGER.info(f"Collecting verse counts from {project_name}.")
236+
collect_verse_counts_wrapper(project_name, config.get("verse_counts", {}))
237+
238+
if args.wildebeest:
239+
if not args.extract_corpora:
240+
LOGGER.warning(
241+
"--extract_corpora was not included. Wildebeest requires the corpus to be extracted first."
242+
)
243+
244+
extract_file = list(SIL_NLP_ENV.mt_scripture_dir.glob(f"*{project_name}.txt"))[0]
245+
extract_file = str(extract_file)
246+
LOGGER.info(f"Running Wildebeest analysis on {extract_file}.")
247+
wildebeest_config = config.get("wildebeest", {})
248+
old_argv = sys.argv
249+
try:
250+
sys.argv = [
251+
"wb_ana",
252+
"-i",
253+
extract_file,
254+
"-j",
255+
f"{project_name}_wildebeest.json",
256+
"-o",
257+
f"{project_name}_wildebeest.txt",
258+
"-x",
259+
str(wildebeest_config.get("max_examples", 500)),
260+
"-n",
261+
str(wildebeest_config.get("max_cases", 500)),
262+
"-r",
263+
str(wildebeest_config.get("ref_id_file", "silnlp/assets/vref.txt")),
264+
]
265+
wb_ana.main()
266+
finally:
267+
sys.argv = old_argv
240268

241269

242270
if __name__ == "__main__":

0 commit comments

Comments
 (0)