|
1 | 1 | import argparse |
| 2 | +import getpass |
2 | 3 | import logging |
| 4 | +import tempfile |
| 5 | +import zipfile |
| 6 | +from datetime import datetime |
3 | 7 | from pathlib import Path |
4 | 8 |
|
| 9 | +import wildebeest.wb_analysis as wb_ana |
5 | 10 | import yaml |
6 | 11 |
|
| 12 | +from scripts.clean_projects import clean_projects |
| 13 | + |
7 | 14 | from .collect_verse_counts import collect_verse_counts |
8 | 15 | from .environment import SIL_NLP_ENV |
9 | 16 | from .extract_corpora import extract_corpora |
@@ -58,6 +65,50 @@ def copy_paratext_project_folder(source_dir: Path, project_name: str, overwrite= |
58 | 65 | _copy_file_to_paratext_project(source_item, target_item, overwrite=overwrite) |
59 | 66 |
|
60 | 67 |
|
def collect_verse_counts_wrapper(project_name: str, verse_counts_config: dict) -> None:
    """Resolve verse-count settings for a project and run ``collect_verse_counts``.

    Settings are read from ``verse_counts_config``; any key that is missing, null,
    or empty falls back to a default:

    * ``output_folder`` -> ``SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name``
    * ``input_folder``  -> ``SIL_NLP_ENV.mt_scripture_dir``
    * ``files``         -> ``f"*{project_name}*.txt"``
    * ``deutero`` / ``recount`` -> ``False``

    The output folder is created if needed. If the input folder does not exist,
    or no file in it matches the pattern, an error is logged and the function
    returns without collecting anything.

    Args:
        project_name: Project name, used to build the default output folder and
            the default file pattern.
        verse_counts_config: The ``verse_counts`` section of the config. ``None``
            is treated the same as an empty mapping.
    """
    # A bare ``verse_counts:`` key in YAML loads as None rather than {}.
    verse_counts_config = verse_counts_config or {}

    # Use ``or`` instead of a ``.get`` default: a key that is present but null/empty
    # must still fall back, otherwise Path(None) raises TypeError.
    output_folder = Path(
        verse_counts_config.get("output_folder")
        or SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name
    )
    output_folder.mkdir(parents=True, exist_ok=True)

    input_folder_path = Path(verse_counts_config.get("input_folder") or SIL_NLP_ENV.mt_scripture_dir)
    file_patterns = verse_counts_config.get("files") or f"*{project_name}*.txt"

    if not input_folder_path.exists():
        LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.")
        return

    # Only the existence of a match matters here, so stop at the first one.
    if next(input_folder_path.glob(file_patterns), None) is None:
        LOGGER.error(
            f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection."
        )
        return

    collect_verse_counts(
        input_folder=input_folder_path,
        output_folder=output_folder,
        file_patterns=file_patterns,
        deutero=verse_counts_config.get("deutero", False),
        recount=verse_counts_config.get("recount", False),
    )
| 99 | + |
| 100 | + |
def get_config(config_path: str) -> dict:
    """Load the onboarding configuration from a YAML file.

    Args:
        config_path: Path to the YAML config file. A falsy value (``None`` or
            ``""``) means no config file was supplied.

    Returns:
        The parsed top-level mapping, or an empty dict when no path is given or
        the file contains no YAML content.

    Raises:
        FileNotFoundError: If ``config_path`` is given but the file does not exist.
        ValueError: If the file's top-level YAML value is not a mapping.
    """
    if not config_path:
        return {}

    config_file = Path(config_path)
    if not config_file.exists():
        raise FileNotFoundError(f"Config file '{config_file}' does not exist.")

    with config_file.open("r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # safe_load returns None for an empty document; callers expect a dict they can ``.get`` from.
    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file '{config_file}' must contain a mapping at the top level, got {type(config).__name__}."
        )
    return config
| 110 | + |
| 111 | + |
61 | 112 | def main() -> None: |
62 | 113 | parser = argparse.ArgumentParser( |
63 | 114 | description="Performs several steps to onboard a new project before training a model.", |
@@ -97,84 +148,95 @@ def main() -> None: |
97 | 148 | action="store_true", |
98 | 149 | help="Collect various counts from the extracted Paratext project.", |
99 | 150 | ) |
| 151 | + parser.add_argument( |
| 152 | + "--clean-project", |
| 153 | + default=False, |
| 154 | + action="store_true", |
| 155 | + help="Cleans the Paratext project folder by removing unnecessary files and folders.", |
| 156 | + ) |
| 157 | + parser.add_argument( |
| 158 | + "--timestamp", |
| 159 | + default=False, |
| 160 | + action="store_true", |
| 161 | + help="Add a timestamp to the project folder name when creating a new Paratext project folder.", |
| 162 | + ) |
| 163 | + parser.add_argument( |
| 164 | + "--wildebeest", default=False, action="store_true", help="Run Wildebeest analysis on the extracted corpora." |
| 165 | + ) |
100 | 166 |
|
101 | 167 | args = parser.parse_args() |
102 | 168 | if not args.project: |
103 | 169 | raise ValueError("Project name is required. Please provide a valid Paratext project name using <project>.") |
104 | 170 |
|
| 171 | + config = get_config(args.config) if args.config else {} |
| 172 | + |
| 173 | + if args.project.endswith(".zip"): |
| 174 | + with zipfile.ZipFile(args.project, "r") as zip_ref: |
| 175 | + # Check if any file in the zip is encrypted |
| 176 | + temp_dir = tempfile.TemporaryDirectory() |
| 177 | + needs_password = any(zinfo.flag_bits & 0x1 for zinfo in zip_ref.infolist()) |
| 178 | + if needs_password: |
| 179 | + pwd = getpass.getpass(prompt=f"Enter password for zip file '{args.project}': ") |
| 180 | + zip_ref.extractall(temp_dir.name, pwd=pwd.encode()) |
| 181 | + else: |
| 182 | + zip_ref.extractall(temp_dir.name) |
| 183 | + args.copy_from = temp_dir.name |
| 184 | + args.project = Path(args.project).stem |
| 185 | + |
105 | 186 | project_name = args.project |
| 187 | + if args.timestamp: |
| 188 | + |
| 189 | + now = datetime.now() |
| 190 | + timestamp = now.strftime("%Y_%m_%d") |
| 191 | + project_name = f"{args.project}_{timestamp}" |
| 192 | + LOGGER.info(f"Timestamping project. New project name: {project_name}") |
106 | 193 |
|
107 | 194 | if args.copy_from: |
108 | 195 | LOGGER.info(f"Onboarding project: {args.project}") |
109 | 196 | paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name) |
110 | 197 | copy_paratext_project_folder(Path(args.copy_from), paratext_project_dir, overwrite=args.overwrite) |
111 | 198 |
|
112 | | - if args.config: |
113 | | - config_file = Path(args.config) |
114 | | - if not config_file.exists(): |
115 | | - raise FileNotFoundError(f"Config file '{config_file}' does not exist.") |
116 | | - with config_file.open("r", encoding="utf-8") as file: |
117 | | - config = yaml.safe_load(file) |
118 | | - else: |
119 | | - raise ValueError("Config file is required. Please provide a valid configuration file using --config.") |
| 199 | + if args.clean_project: |
| 200 | + LOGGER.info(f"Cleaning Paratext project folder for {project_name}.") |
| 201 | + clean_projects( |
| 202 | + argparse.Namespace( |
| 203 | + input=get_paratext_project_dir(project_name), |
| 204 | + delete_subfolders=True, |
| 205 | + confirm_delete=True, |
| 206 | + dry_run=False, |
| 207 | + ) |
| 208 | + ) |
120 | 209 |
|
121 | 210 | if args.extract_corpora: |
122 | | - LOGGER.info(f"Extracting {project_name}.") |
| 211 | + extract_config = config.get("extract_corpora", {}) |
123 | 212 | extract_corpora( |
124 | 213 | projects={project_name}, |
125 | | - books_to_include=config["extract_corpora"]["include"] if "include" in config["extract_corpora"] else [], |
126 | | - books_to_exclude=config["extract_corpora"]["exclude"] if "exclude" in config["extract_corpora"] else [], |
127 | | - include_markers=(config["extract_corpora"]["markers"] if "markers" in config["extract_corpora"] else False), |
128 | | - extract_lemmas=config["extract_corpora"]["lemmas"] if "lemmas" in config["extract_corpora"] else False, |
129 | | - extract_project_vrefs=( |
130 | | - config["extract_corpora"]["project-vrefs"] if "project-vrefs" in config["extract_corpora"] else False |
131 | | - ), |
| 214 | + books_to_include=extract_config.get("include", []), |
| 215 | + books_to_exclude=extract_config.get("exclude", []), |
| 216 | + include_markers=extract_config.get("markers", False), |
| 217 | + extract_lemmas=extract_config.get("lemmas", False), |
| 218 | + extract_project_vrefs=extract_config.get("project-vrefs", False), |
132 | 219 | ) |
133 | 220 |
|
134 | 221 | if args.collect_verse_counts: |
135 | 222 | if not args.extract_corpora: |
136 | 223 | LOGGER.warning( |
137 | 224 | "--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first." |
138 | 225 | ) |
139 | | - |
140 | 226 | LOGGER.info(f"Collecting verse counts from {project_name}.") |
| 227 | + collect_verse_counts_wrapper(project_name, config.get("verse_counts", {})) |
141 | 228 |
|
142 | | - if config["verse_counts"]["output_folder"]: |
143 | | - output_folder = Path(config["verse_counts"]["output_folder"]) |
144 | | - if not output_folder.exists(): |
145 | | - output_folder.mkdir(parents=True, exist_ok=True) |
146 | | - else: |
147 | | - output_folder = SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name |
148 | | - if not output_folder.exists(): |
149 | | - output_folder.mkdir(parents=True, exist_ok=True) |
150 | | - input_folder = ( |
151 | | - config["verse_counts"]["input_folder"] |
152 | | - if "input_folder" in config["verse_counts"] |
153 | | - else SIL_NLP_ENV.mt_scripture_dir |
154 | | - ) |
155 | | - file_patterns = ( |
156 | | - config["verse_counts"]["files"] if "files" in config["verse_counts"] else f"*{project_name}*.txt" |
157 | | - ) |
158 | | - |
159 | | - input_folder_path = Path(input_folder) |
160 | | - if not input_folder_path.exists(): |
161 | | - LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.") |
162 | | - return |
163 | | - |
164 | | - matched_files = list(input_folder_path.glob(file_patterns)) |
165 | | - if not matched_files: |
166 | | - LOGGER.error( |
167 | | - f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection." |
168 | | - ) |
169 | | - return |
170 | | - |
171 | | - collect_verse_counts( |
172 | | - input_folder=input_folder_path, |
173 | | - output_folder=output_folder, |
174 | | - file_patterns=file_patterns, |
175 | | - deutero=config["verse_counts"]["deutero"] if "deutero" in config["verse_counts"] else False, |
176 | | - recount=config["verse_counts"]["recount"] if "recount" in config["verse_counts"] else False, |
177 | | - ) |
| 229 | + if args.wildebeest: |
| 230 | + if not args.extract_corpora: |
| 231 | + LOGGER.warning("--extract_corpora was not included. Wildebeest requires the corpus to be extracted first.") |
| 232 | + |
| 233 | + extract_file = list(SIL_NLP_ENV.mt_scripture_dir.glob(f"*{project_name}.txt"))[0] |
| 234 | + LOGGER.info(f"Running Wildebeest analysis on {extract_file}.") |
| 235 | + with ( |
| 236 | + open(f"{project_name}_wildebeest.json", "w", encoding="utf-8") as json_f, |
| 237 | + open(f"{project_name}_wildebeest.txt", "w", encoding="utf-8") as txt_f, |
| 238 | + ): |
| 239 | + wb_ana.process(in_file=extract_file, json_output=json_f, pp_output=txt_f) |
178 | 240 |
|
179 | 241 |
|
180 | 242 | if __name__ == "__main__": |
|
0 commit comments