Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 111 additions & 45 deletions silnlp/common/combine_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
import yaml
from pathlib import Path
import regex as re
from ..common.environment import SIL_NLP_ENV

# List of keywords to exclude from filenames
EXCLUDED_KEYWORDS = ["XRI", "_AI", "train"]
from .environment import SIL_NLP_ENV

# A hardcoded list of major language ISO codes from the Flores-200 benchmark.
# This list can be modified as needed.
Expand All @@ -30,6 +27,9 @@
'urd', 'uz', 'uzb', 'vec', 'war', 'yi', 'yid', 'yue'
}

# List of keywords to exclude from filenames
EXCLUDED_KEYWORDS = ["XRI", "AI"]

def extract_lang_code(corpus_name):
"""
Extracts a 2 or 3 letter ISO code from a corpus name that follows the
Expand All @@ -44,29 +44,29 @@ def combine_config_files(root_folder: Path, output_filename: str = "config.yml")
"""
Finds and combines all config.yml files in subfolders with specific names.
Re-sorts languages, de-duplicates entries, and sets a new aligner.
Filters out older files with dates and files with excluded keywords.
Also filters out older files with dates and files with excluded keywords.
"""
print(f"Searching for config.yml files in subfolders of: {root_folder}")

# Dictionary to hold corpus names, grouped by language code
# Dictionary to hold corpus names, grouped by language code, with dates
corpus_by_lang = {}

# Initialize a base config with defaults
global_config = {
'data': {
'aligner': 'eflomal',
'corpus_pairs': [{
'mapping': 'many_to_many',
'type': 'train',
'src': [],
'trg': [],
'mapping': 'many_to_many',
'test_size': 0,
'type': 'train',
'val_size': 0
}]
}
}
tokenize_setting = None


found_first_config = False

# Find all config.yml files in subdirectories
for config_file in root_folder.rglob('**/config.yml'):
Expand All @@ -84,8 +84,9 @@ def combine_config_files(root_folder: Path, output_filename: str = "config.yml")
with open(config_file, 'r') as f:
config = yaml.safe_load(f)

if tokenize_setting is None and 'data' in config and 'tokenize' in config['data']:
tokenize_setting = config['data']['tokenize']
if not found_first_config and 'data' in config and 'tokenize' in config['data']:
global_config['data']['tokenize'] = config['data']['tokenize']
found_first_config = True

if 'data' in config and 'corpus_pairs' in config['data']:
for pair in config['data']['corpus_pairs']:
Expand All @@ -108,35 +109,26 @@ def combine_config_files(root_folder: Path, output_filename: str = "config.yml")

lang_code = extract_lang_code(corpus)
if lang_code:
if lang_code not in corpus_by_lang:
corpus_by_lang[lang_code] = {'dated': [], 'undated': []}

# Extract date from filename if present
date_match = re.search(r'_(\d{4}_\d{2}_\d{2})', corpus)
if date_match:
date_str = date_match.group(1)

corpus_by_lang[lang_code]['dated'].append((date_str, corpus))
else:
corpus_by_lang[lang_code]['undated'].append(corpus)
date_str = date_match.group(1) if date_match else "0000_00_00" # Use a default date for files without one

if lang_code not in corpus_by_lang:
corpus_by_lang[lang_code] = []
corpus_by_lang[lang_code].append((date_str, corpus))
else:
print(f"Skipping invalid corpus name: {corpus}")

except Exception as e:
print(f"Error processing {config_file}: {e}")

# Filter for the most recent file for each language and include all undated files
# Filter for the most recent file for each language
final_corpora = set()
for lang_code, corpora_dict in corpus_by_lang.items():
# Keep all undated files
for corpus in corpora_dict['undated']:
final_corpora.add(corpus)

# Keep only the most recent dated file, if any exist
if corpora_dict['dated']:
corpora_dict['dated'].sort(key=lambda x: x[0], reverse=True)

final_corpora.add(corpora_dict['dated'][0][1])
for lang_code, corpus_list in corpus_by_lang.items():
# Sort by date in descending order to get the most recent file first
corpus_list.sort(key=lambda x: x[0], reverse=True)
# Add the most recent file to the final set
final_corpora.add(corpus_list[0][1])

# Separate filtered corpora into major and minor languages
major_corpora = set()
Expand All @@ -153,41 +145,115 @@ def combine_config_files(root_folder: Path, output_filename: str = "config.yml")
global_config['data']['corpus_pairs'][0]['src'] = sorted(list(major_corpora)) + sorted(list(minor_corpora))
global_config['data']['corpus_pairs'][0]['trg'] = sorted(list(minor_corpora))

# Add tokenize setting if it was found
if tokenize_setting is not None:
global_config['data']['tokenize'] = tokenize_setting
# Write the combined config to a new file in the root folder
output_path = root_folder / output_filename
output_path = root_folder / 'combined_config.yml'
try:
with open(output_path, 'w') as f:
yaml.dump(global_config, f, sort_keys=False)
print(f"\nSuccessfully wrote combined configuration to: {output_path}")
except Exception as e:
print(f"Failed to write the combined config file: {e}")

def update_config(folder: Path) -> None:
    """
    Updates dated corpus stems in a folder's config.yml to the newest dated stem available.

    Every string entry under data -> corpus_pairs -> src / trg whose name ends in
    _YYYY_MM_DD is compared with the files in SIL_NLP_ENV.mt_scripture_dir that share
    the same prefix. If a file with a later date exists, the entry is replaced by that
    file's stem.

    The config is parsed and all replacements are computed before anything on disk is
    touched. Only when at least one entry changed is the existing config.yml renamed to
    config_<today>.yml (today formatted as YYYY_MM_DD) and a new config.yml written.
    An unreadable, empty or already up-to-date config is therefore left exactly as it was.

    Args:
        folder: Directory that is expected to contain config.yml.

    Raises:
        SystemExit: With exit code 1 if config.yml does not exist in folder.
    """
    import sys
    import datetime

    config_path = folder / "config.yml"
    if not config_path.is_file():
        print(f"Error: config.yml not found in {folder}")
        sys.exit(1)

    # Parse the config while it is still in place; a parse error must not leave the
    # folder without a config.yml.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Matches <prefix>_YYYY_MM_DD and captures the prefix.
    dated_stem = re.compile(r"^(.*)_\d{4}_\d{2}_\d{2}$")

    def find_latest_stem(stem):
        # Returns the stem of the newest <prefix>_YYYY_MM_DD file in the scripture
        # directory, or None if stem is not dated or no such file exists.
        m = dated_stem.match(stem)
        if not m:
            return None
        prefix = m.group(1)
        prefix_dated = re.compile(rf"^{re.escape(prefix)}_(\d{{4}}_\d{{2}}_\d{{2}})$")
        candidates = []
        for file in SIL_NLP_ENV.mt_scripture_dir.glob(f"{prefix}_????_??_??.*"):
            # The glob only checks the shape; the regex confirms the date part is digits.
            m2 = prefix_dated.match(file.stem)
            if m2:
                candidates.append((m2.group(1), file.stem))
        if not candidates:
            return None
        # Zero-padded YYYY_MM_DD strings sort chronologically, so max() is the newest.
        return max(candidates)[1]

    updated = False
    # safe_load returns None for an empty file, and "data:" / "corpus_pairs:" may be
    # present without a value, so every level is checked before it is used.
    data = config.get("data") if isinstance(config, dict) else None
    pairs = data.get("corpus_pairs") if isinstance(data, dict) else None
    for pair in pairs or []:
        if not isinstance(pair, dict):
            continue
        for key in ("src", "trg"):
            if key not in pair:
                # Do not add an empty src/trg entry to a pair that never had one.
                continue
            items = pair[key]
            was_scalar = isinstance(items, str)
            if was_scalar:
                stems = [items]
            elif isinstance(items, list):
                stems = items
            else:
                continue

            new_items = []
            changed = False
            for stem in stems:
                latest_stem = find_latest_stem(stem) if isinstance(stem, str) else None
                if latest_stem and latest_stem != stem:
                    print(f"Updating {stem} -> {latest_stem}")
                    new_items.append(latest_stem)
                    changed = True
                else:
                    new_items.append(stem)

            if changed:
                # Keep the shape the entry had in the file: a scalar stays a scalar
                # and a list stays a list.
                pair[key] = new_items[0] if was_scalar else new_items
                updated = True

    if not updated:
        # Nothing was moved, so there is nothing to restore.
        print("No updates made to config.yml.")
        return

    # Back up the original only now that there is a new version to write.
    today = datetime.date.today().strftime("%Y_%m_%d")
    backup_path = folder / f"config_{today}.yml"
    config_path.replace(backup_path)
    print(f"Backed up config.yml to {backup_path}")

    try:
        with open(config_path, "w", encoding="utf-8") as f:
            yaml.dump(config, f, sort_keys=False, allow_unicode=True)
    except Exception:
        # Put the original back rather than leaving a missing or partial config.yml.
        backup_path.replace(config_path)
        raise
    print(f"Updated config.yml written to {config_path}")

if __name__ == '__main__':
    # Command-line entry point: either combine the config.yml files found under a
    # folder, or update the dated file stems in a single folder's config.yml.
    parser = argparse.ArgumentParser(
        description='Combine multiple config.yml files into one or update config.yml with latest file stems.'
    )
    # --update-config and --output_filename cannot be given together.
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        '--update-config',
        action='store_true',
        help='Update config.yml in the given folder with latest file stems.'
    )
    group.add_argument(
        '--output_filename',
        type=str,
        default=None,
        help='Output filename for the combined file. The default is config.yml.'
    )
    parser.add_argument(
        'folder',
        type=str,
        help='The root folder to search for config.yml files or to update config.'
    )

    args = parser.parse_args()
    folder = Path(args.folder)

    # If the argument is not an existing directory as given, try it relative to the
    # experiments directory.
    if not folder.is_dir():
        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder

    if not folder.is_dir():
        print(f"Error: Couldn't find {args.folder} or {folder}.")
    elif args.update_config:
        update_config(folder)
    else:
        # The option defaults to None inside the exclusive group, so the documented
        # default name is applied here.
        output_filename = args.output_filename or "config.yml"
        combine_config_files(folder, output_filename)