Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions calculator/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,190 @@ def cpb_calculation(seq, cps_table):
return CPB


def _load_codon_usage_table(table_path):
"""
Load a codon usage frequency table from a JSON file.

Parameters
----------
table_path : str
Path to a JSON file mapping codons to {"aa": ..., "freq_per_1000": ...}.

Returns
-------
dict
The parsed codon usage table.
"""
with open(table_path, 'r') as f:
return json.load(f)


def _get_expected_distribution(codon_table, amino_acid):
"""
Get the expected codon frequency distribution for a given amino acid
from the organism's codon usage table.

Parameters
----------
codon_table : dict
Codon usage table mapping codons to {"aa": ..., "freq_per_1000": ...}.
amino_acid : str
Three-letter amino acid code (e.g., "Phe", "Leu").

Returns
-------
dict
Mapping from codon to its expected relative frequency (normalized to sum=1).
"""
codons_for_aa = {
codon: entry["freq_per_1000"]
for codon, entry in codon_table.items()
if entry["aa"] == amino_acid
}
total = sum(codons_for_aa.values())
if total == 0:
return {}
return {codon: freq / total for codon, freq in codons_for_aa.items()}


def cufd_kl_divergence(seq, codon_table_path):
"""
Calculate the Codon Usage Frequency Distribution (CUFD) score using
KL divergence between the observed codon distribution in the sequence
and the expected distribution from a target organism's codon usage table.

A lower score indicates better match to the target organism's codon usage.

Parameters
----------
seq : str
cDNA/mRNA sequence (must be divisible by 3).
codon_table_path : str
Path to the organism codon usage JSON file.

Returns
-------
float
The average KL divergence across all amino acids present in the sequence.
Returns 0.0 for empty sequences or sequences with no valid codons.
"""
seq = seq.upper().replace('T', 'U')
codons = [seq[i:i+3] for i in range(0, len(seq), 3) if len(seq[i:i+3]) == 3]

if not codons:
return 0.0

codon_table = _load_codon_usage_table(codon_table_path)

# Group codons by amino acid
aa_codon_counts = {}
for codon in codons:
if codon not in codon_table:
continue
aa = codon_table[codon]["aa"]
if aa == "Stop":
continue
if aa not in aa_codon_counts:
aa_codon_counts[aa] = {}
aa_codon_counts[aa][codon] = aa_codon_counts[aa].get(codon, 0) + 1

if not aa_codon_counts:
return 0.0

# Compute KL divergence per amino acid
epsilon = 1e-10
kl_values = []
for aa, observed_counts in aa_codon_counts.items():
expected_dist = _get_expected_distribution(codon_table, aa)
if not expected_dist:
continue

total_observed = sum(observed_counts.values())
observed_dist = {
codon: observed_counts.get(codon, 0) / total_observed
for codon in expected_dist
}

kl = 0.0
for codon in expected_dist:
p = observed_dist.get(codon, 0) + epsilon
q = expected_dist[codon] + epsilon
kl += p * np.log(p / q)

kl_values.append(kl)

return float(np.mean(kl_values)) if kl_values else 0.0


def cufd_cosine_similarity(seq, codon_table_path):
"""
Calculate the Codon Usage Frequency Distribution (CUFD) score using
cosine similarity between the observed codon distribution in the sequence
and the expected distribution from a target organism's codon usage table.

A higher score (closer to 1.0) indicates better match to the target
organism's codon usage.

Parameters
----------
seq : str
cDNA/mRNA sequence (must be divisible by 3).
codon_table_path : str
Path to the organism codon usage JSON file.

Returns
-------
float
The average cosine similarity across all amino acids present in the
sequence. Range: [0.0, 1.0]. Returns 0.0 for empty sequences.
"""
seq = seq.upper().replace('T', 'U')
codons = [seq[i:i+3] for i in range(0, len(seq), 3) if len(seq[i:i+3]) == 3]

if not codons:
return 0.0

codon_table = _load_codon_usage_table(codon_table_path)

# Group codons by amino acid
aa_codon_counts = {}
for codon in codons:
if codon not in codon_table:
continue
aa = codon_table[codon]["aa"]
if aa == "Stop":
continue
if aa not in aa_codon_counts:
aa_codon_counts[aa] = {}
aa_codon_counts[aa][codon] = aa_codon_counts[aa].get(codon, 0) + 1

if not aa_codon_counts:
return 0.0

cos_values = []
for aa, observed_counts in aa_codon_counts.items():
expected_dist = _get_expected_distribution(codon_table, aa)
if not expected_dist:
continue

total_observed = sum(observed_counts.values())

# Build vectors in the same codon order
obs_vec = np.array([observed_counts.get(c, 0) / total_observed for c in expected_dist])
exp_vec = np.array([expected_dist[c] for c in expected_dist])

dot = np.dot(obs_vec, exp_vec)
norm_obs = np.linalg.norm(obs_vec)
norm_exp = np.linalg.norm(exp_vec)

if norm_obs == 0 or norm_exp == 0:
cos_values.append(0.0)
else:
cos_values.append(float(dot / (norm_obs * norm_exp)))

return float(np.mean(cos_values)) if cos_values else 0.0


# input: directory
def sequences_from_dir(path, pattern=".fasta", output_file="mfe_table.csv"):
seqs = []
Expand Down
66 changes: 66 additions & 0 deletions calculator/data/codon_usage_human.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"UUU": {"aa": "Phe", "freq_per_1000": 17.6},
"UUC": {"aa": "Phe", "freq_per_1000": 20.3},
"UUA": {"aa": "Leu", "freq_per_1000": 7.7},
"UUG": {"aa": "Leu", "freq_per_1000": 12.9},
"CUU": {"aa": "Leu", "freq_per_1000": 13.2},
"CUC": {"aa": "Leu", "freq_per_1000": 19.6},
"CUA": {"aa": "Leu", "freq_per_1000": 7.2},
"CUG": {"aa": "Leu", "freq_per_1000": 39.6},
"AUU": {"aa": "Ile", "freq_per_1000": 16.0},
"AUC": {"aa": "Ile", "freq_per_1000": 20.8},
"AUA": {"aa": "Ile", "freq_per_1000": 7.5},
"AUG": {"aa": "Met", "freq_per_1000": 22.0},
"GUU": {"aa": "Val", "freq_per_1000": 11.0},
"GUC": {"aa": "Val", "freq_per_1000": 14.5},
"GUA": {"aa": "Val", "freq_per_1000": 7.1},
"GUG": {"aa": "Val", "freq_per_1000": 28.1},
"UCU": {"aa": "Ser", "freq_per_1000": 15.2},
"UCC": {"aa": "Ser", "freq_per_1000": 17.7},
"UCA": {"aa": "Ser", "freq_per_1000": 12.2},
"UCG": {"aa": "Ser", "freq_per_1000": 4.4},
"CCU": {"aa": "Pro", "freq_per_1000": 17.5},
"CCC": {"aa": "Pro", "freq_per_1000": 19.8},
"CCA": {"aa": "Pro", "freq_per_1000": 16.9},
"CCG": {"aa": "Pro", "freq_per_1000": 6.9},
"ACU": {"aa": "Thr", "freq_per_1000": 13.1},
"ACC": {"aa": "Thr", "freq_per_1000": 18.9},
"ACA": {"aa": "Thr", "freq_per_1000": 15.1},
"ACG": {"aa": "Thr", "freq_per_1000": 6.1},
"GCU": {"aa": "Ala", "freq_per_1000": 18.4},
"GCC": {"aa": "Ala", "freq_per_1000": 27.7},
"GCA": {"aa": "Ala", "freq_per_1000": 15.8},
"GCG": {"aa": "Ala", "freq_per_1000": 7.4},
"UAU": {"aa": "Tyr", "freq_per_1000": 12.2},
"UAC": {"aa": "Tyr", "freq_per_1000": 15.3},
"UAA": {"aa": "Stop", "freq_per_1000": 1.0},
"UAG": {"aa": "Stop", "freq_per_1000": 0.8},
"CAU": {"aa": "His", "freq_per_1000": 10.9},
"CAC": {"aa": "His", "freq_per_1000": 15.1},
"CAA": {"aa": "Gln", "freq_per_1000": 12.3},
"CAG": {"aa": "Gln", "freq_per_1000": 34.2},
"AAU": {"aa": "Asn", "freq_per_1000": 17.0},
"AAC": {"aa": "Asn", "freq_per_1000": 19.1},
"AAA": {"aa": "Lys", "freq_per_1000": 24.4},
"AAG": {"aa": "Lys", "freq_per_1000": 31.9},
"GAU": {"aa": "Asp", "freq_per_1000": 21.8},
"GAC": {"aa": "Asp", "freq_per_1000": 25.1},
"GAA": {"aa": "Glu", "freq_per_1000": 29.0},
"GAG": {"aa": "Glu", "freq_per_1000": 39.6},
"UGU": {"aa": "Cys", "freq_per_1000": 10.6},
"UGC": {"aa": "Cys", "freq_per_1000": 12.6},
"UGA": {"aa": "Stop", "freq_per_1000": 1.6},
"UGG": {"aa": "Trp", "freq_per_1000": 13.2},
"CGU": {"aa": "Arg", "freq_per_1000": 4.5},
"CGC": {"aa": "Arg", "freq_per_1000": 10.4},
"CGA": {"aa": "Arg", "freq_per_1000": 6.2},
"CGG": {"aa": "Arg", "freq_per_1000": 11.4},
"AGA": {"aa": "Arg", "freq_per_1000": 12.2},
"AGG": {"aa": "Arg", "freq_per_1000": 12.0},
"AGU": {"aa": "Ser", "freq_per_1000": 12.1},
"AGC": {"aa": "Ser", "freq_per_1000": 19.5},
"GGU": {"aa": "Gly", "freq_per_1000": 10.8},
"GGC": {"aa": "Gly", "freq_per_1000": 22.2},
"GGA": {"aa": "Gly", "freq_per_1000": 16.5},
"GGG": {"aa": "Gly", "freq_per_1000": 16.5}
}
Loading