InscriptaLabs
diff --git a/‎CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎inscripta/biocantor/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎inscripta/biocantor/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎inscripta/biocantor/io/genbank/exc.py‎
Lines changed: 8 additions & 0 deletions b/‎inscripta/biocantor/io/genbank/exc.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎inscripta/biocantor/io/genbank/parser.py‎
Lines changed: 105 additions & 41 deletions b/‎inscripta/biocantor/io/genbank/parser.py‎
Lines changed: 105 additions & 41 deletions
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [0.9.0] 2021-09-03
+### Fixed
+- `LocusTag` parser is now able to handle CDS-only features, and have more informative error reporting for locus tag collisions. 
+- GenBank parser no longer raises an exception for multi-stranded feature; instead it warns and moves on
+
 ## [0.8.0]
 ### Fixed
 - Do not trust feature type annotations to define coding vs. non-coding when parsing GenBank files; only rely on the presence/absence of CDS intervals associated with the transcript.
 
@@ -1,4 +1,4 @@
-__version__ = "0.8.0"
+__version__ = "0.9.0"
 
 from abc import ABC, abstractmethod
 from enum import Enum
 
@@ -35,3 +35,11 @@ class GenBankLocationException(GenBankParserError):
     """
 
     pass
+
+
+class GenBankNullStrandException(GenBankParserError):
+    """
+    Raised when BioPython constructs a feature with a null Strand.
+    """
+
+    pass
@@ -21,8 +21,8 @@
 :meth:`GeneFeature.to_gene_model()`. This function can be over-ridden to provide custom parsing implementations.
 """
 import itertools
-import logging
 import pathlib
+import warnings
 from abc import ABC
 from collections import Counter
 from copy import deepcopy
@@ -31,8 +31,8 @@
 from Bio import SeqIO
 from Bio.SeqFeature import SeqFeature
 from Bio.SeqRecord import SeqRecord
-
 from inscripta.biocantor.gene import Biotype, CDSInterval, CDSFrame
+from inscripta.biocantor.io.exc import StrandViolationWarning
 from inscripta.biocantor.io.features import extract_feature_types, extract_feature_name_id, merge_qualifiers
 from inscripta.biocantor.io.genbank.constants import (
     GeneFeatures,
@@ -48,6 +48,7 @@
     EmptyGenBankError,
     GenBankLocusTagError,
     GenBankLocationException,
+    GenBankNullStrandException,
 )
 from inscripta.biocantor.io.models import (
     GeneIntervalModel,
@@ -63,8 +64,6 @@
     EmptyLocation,
 )
 
-logger = logging.getLogger(__name__)
-
 
 class Feature(ABC):
     """Generic feature."""
@@ -77,7 +76,7 @@ def __init__(self, feature: SeqFeature, record: SeqRecord):
         if feature.location is None:
             raise GenBankLocationException(f"Feature {feature} did not have parseable coordinates.")
         if not feature.strand:
-            raise GenBankParserError(f"Feature {feature} is unstranded or has multiple strands.")
+            raise GenBankNullStrandException(f"Feature {feature} is unstranded or has multiple strands.")
         self.feature = feature
         self.record = record
         self.children = []
@@ -220,7 +219,7 @@ def from_transcript_or_cds_feature(feature: SeqFeature, seqrecord: SeqRecord) ->
 
         Construct a GeneFeature from such records."""
         old_type = feature.type
-        feature.type = "gene"
+        feature.type = GeneFeatures.GENE.value
         gene = GeneFeature(feature, seqrecord)
         feature.type = old_type
         gene.add_child(feature)
@@ -232,7 +231,6 @@ def add_child(self, feature: SeqFeature):
             self.children.append(TranscriptFeature(feature, self.record))
         elif feature.type in IntervalFeature.types:
             # infer a transcript
-            logger.debug(f"Inferring a transcript for {feature}")
             tx_feature = deepcopy(feature)
             if tx_feature.type == GeneIntervalFeatures.CDS.value:
                 tx_feature.type = TranscriptFeatures.CODING_TRANSCRIPT.value
@@ -307,10 +305,10 @@ def to_gene_model(cls: "GeneFeature") -> Dict[str, Any]:
                 cds_frames=cds_frames,
                 qualifiers=tx.merge_cds_qualifiers_to_transcript(),
                 is_primary_tx=False,
-                transcript_id=tx.get_qualifier_from_tx_or_cds_features("transcript_id"),
-                protein_id=tx.get_qualifier_from_tx_or_cds_features("protein_id"),
-                product=tx.get_qualifier_from_tx_or_cds_features("product"),
-                transcript_symbol=tx.get_qualifier_from_tx_or_cds_features("gene"),
+                transcript_id=tx.get_qualifier_from_tx_or_cds_features(KnownQualifiers.TRANSCRIPT_ID.value),
+                protein_id=tx.get_qualifier_from_tx_or_cds_features(KnownQualifiers.PROTEIN_ID.value),
+                product=tx.get_qualifier_from_tx_or_cds_features(KnownQualifiers.PRODUCT.value),
+                transcript_symbol=tx.get_qualifier_from_tx_or_cds_features(KnownQualifiers.GENE.value),
                 transcript_type=transcript_biotype.name,
                 sequence_name=tx.record.id,
             )
@@ -321,9 +319,9 @@ def to_gene_model(cls: "GeneFeature") -> Dict[str, Any]:
         gene = GeneIntervalModel.Schema().load(
             dict(
                 transcripts=transcripts,
-                gene_id=cls.feature.qualifiers.get("gene_id", [None])[0],
-                gene_symbol=cls.feature.qualifiers.get("gene", [None])[0],
-                locus_tag=cls.feature.qualifiers.get("locus_tag", [None])[0],
+                gene_id=cls.feature.qualifiers.get(KnownQualifiers.GENE_ID.value, [None])[0],
+                gene_symbol=cls.feature.qualifiers.get(KnownQualifiers.GENE.value, [None])[0],
+                locus_tag=cls.feature.qualifiers.get(KnownQualifiers.LOCUS_TAG.value, [None])[0],
                 gene_type=gene_biotype.name,
                 qualifiers=cls.feature.qualifiers,
                 sequence_name=cls.record.id,
@@ -358,7 +356,7 @@ def infer_exon_features(self):
     def construct_frames(self, cds_interval: Location) -> List[str]:
         """We need to build frames. Since GenBank lacks this info, do our best"""
         # make 0 based offset, if possible, otherwise assume always in frame
-        frame = int(self.children[0].feature.qualifiers.get("codon_start", [1])[0]) - 1
+        frame = int(self.children[0].feature.qualifiers.get(KnownQualifiers.CODON_START.value, [1])[0]) - 1
         frame = CDSFrame.from_int(frame)
         frames = CDSInterval.construct_frames_from_location(cds_interval, frame)
         return [x.name for x in frames]
@@ -396,7 +394,7 @@ def find_cds_interval(self, exon_interval: Location) -> Location:
         cds_i = SingleInterval(
             cds[0].feature.location.nofuzzy_start,
             cds[-1].feature.location.nofuzzy_end,
-            Strand.from_int(self.feature.strand),
+            Strand.from_int(self.strand),
         )
         return exon_interval.intersection(cds_i)
 
@@ -425,6 +423,47 @@ def __str__(self):
         return f"----> {self.feature.__repr__()}"
 
 
+def _construct_gene_from_feature(
+    feature: SeqFeature,
+    seqrecord: SeqRecord,
+    cls_or_fn: Callable[[SeqFeature, SeqRecord], GeneFeature],
+) -> Optional[GeneFeature]:
+    """
+    Convenience function for producing :class:`GeneFeature` from a `SeqFeature`, handling both
+    possible constructor routes (construction from a ``gene`` feature as found in the GenBank,
+    or inference from a transcript/interval level feature in case no ``gene`` level feature was found).
+
+    This wrapper function catches exceptions raised for common errors, and converts them to warnings
+    as appropriate.
+    """
+    try:
+        return cls_or_fn(feature, seqrecord)
+    except GenBankNullStrandException:
+        warnings.warn(
+            StrandViolationWarning(f"Found multiple strands for feature {feature}. This feature will be skipped.")
+        )
+
+
+def _construct_feature_collection_from_features(
+    features: List[SeqFeature],
+    seqrecord: SeqRecord,
+) -> Optional[FeatureIntervalGenBankCollection]:
+    """
+    Convenience function for producing :class:`FeatureIntervalGenBankCollection` from a `SeqFeature`.
+
+    This wrapper function catches exceptions raised for common errors, and converts them to warnings
+    as appropriate.
+    """
+    try:
+        return FeatureIntervalGenBankCollection(features, seqrecord)
+    except GenBankNullStrandException:
+        warnings.warn(
+            StrandViolationWarning(
+                f"Found multiple strands for feature group {features}. " f"This feature collection will be skipped."
+            )
+        )
+
+
 def parse_genbank(
     genbank_handle_or_path: Union[TextIO, str, pathlib.Path],
     parse_func: Optional[Callable[[GeneFeature], Dict[str, Any]]] = GeneFeature.to_gene_model,
@@ -514,15 +553,20 @@ def group_gene_records_from_sorted_genbank(
             # base case for start; iterate until we find a gene
             elif gene is None:
                 if feature.type in GeneFeature.types:
-                    gene = GeneFeature(feature, seqrecord)
+                    gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
+                    # gene is None if it was not parseable
+                    if not gene:
+                        continue
                 # base case for starting with a isolated ncRNA or CDS feature; immediately add them
                 # and reset the gene to None
                 elif feature.type in TranscriptFeature.types or feature.type in IntervalFeature.types:
-                    gene = GeneFeature.from_transcript_or_cds_feature(feature, seqrecord)
-                    gene.finalize()
-                    gene = parse_func(gene)
-                    genes.append(gene)
-                    gene = None
+                    gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature.from_transcript_or_cds_feature)
+                    # gene is None if it was not parseable
+                    if gene:
+                        gene.finalize()
+                        gene = parse_func(gene)
+                        genes.append(gene)
+                        gene = None
                 # this must be a generic feature
                 else:
                     feature_features.append(feature)
@@ -532,15 +576,20 @@ def group_gene_records_from_sorted_genbank(
                     gene.finalize()
                     gene = parse_func(gene)
                     genes.append(gene)
-                gene = GeneFeature(feature, seqrecord)
+                gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature)
+                if not gene:
+                    continue
             elif feature.type in TranscriptFeature.types:
                 # if the current gene is non-empty, and the feature is not a mRNA, then this is a isolated ncRNA
                 # finish this gene and start a new one
                 if feature.type != TranscriptFeatures.CODING_TRANSCRIPT and gene.has_children:
                     gene.finalize()
                     gene = parse_func(gene)
                     genes.append(gene)
-                    gene = GeneFeature.from_transcript_or_cds_feature(feature, seqrecord)
+                    gene = _construct_gene_from_feature(feature, seqrecord, GeneFeature.from_transcript_or_cds_feature)
+                    # gene is None if it was not parseable
+                    if not gene:
+                        continue
                 else:
                     gene.add_child(feature)
             elif feature.type in IntervalFeature.types:
@@ -619,31 +668,42 @@ def group_gene_records_by_locus_tag(
         remaining_features = []
         source = None
         for f in seqrecord.features:
-            if f.type in GENBANK_GENE_FEATURES and "locus_tag" in f.qualifiers:
+            if f.type in GENBANK_GENE_FEATURES and KnownQualifiers.LOCUS_TAG.value in f.qualifiers:
                 gene_filtered_features.append(f)
             elif f.type == MetadataFeatures.SOURCE.value:
                 source = f
             else:
                 remaining_features.append(f)
 
-        sorted_gene_filtered_features = sorted(gene_filtered_features, key=lambda f: f.qualifiers["locus_tag"])
+        sorted_gene_filtered_features = sorted(
+            gene_filtered_features, key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value]
+        )
 
         genes = []
         for locus_tag, gene_features in itertools.groupby(
-            sorted_gene_filtered_features, key=lambda f: f.qualifiers["locus_tag"][0]
+            sorted_gene_filtered_features, key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value][0]
         ):
             # sort the features for this locus tag to bubble the "gene" feature to the top, if it exists
-            gene_features = sorted(gene_features, key=lambda f: f.type != "gene")
-            gene = gene_features[0]
+            gene_features = sorted(gene_features, key=lambda f: f.type != GeneFeatures.GENE.value)
 
-            if gene.type not in GeneFeature.types:
-                raise GenBankLocusTagError("Grouping by locus tag produced a mis-ordered interpretation")
-            gene = GeneFeature(gene, seqrecord)
+            # do we have more than one gene with this locus_tag?
+            if len(gene_features) > 1 and gene_features[1].type == GeneFeatures.GENE.value:
+                raise GenBankLocusTagError(
+                    f"Grouping by locus tag found multiple gene features with the same locus tag:"
+                    f"\n{gene_features[0]}\n{gene_features[1]}"
+                )
+
+            gene_feature = gene_features[0]
+            if gene_feature.type == GeneFeatures.GENE.value:
+                gene = _construct_gene_from_feature(gene_feature, seqrecord, GeneFeature)
+            else:
+                gene = _construct_gene_from_feature(gene_feature, seqrecord, GeneFeature.from_transcript_or_cds_feature)
+            # gene is None if it was not parseable
+            if not gene:
+                continue
 
             for feature in gene_features[1:]:
-                if feature.type in GeneFeature.types:
-                    raise GenBankLocusTagError("Grouping by locus tag found two genes")
-                elif feature.type in TranscriptFeature.types:
+                if feature.type in TranscriptFeature.types:
                     gene.add_child(feature)
                 elif feature.type in IntervalFeature.types:
                     if len(gene.children) == 0:
@@ -723,19 +783,23 @@ def _extract_generic_features(
     """
 
     # sort by locus tag, or null if no locus tag is provided.
-    sorted_filtered_features = sorted(filtered_features, key=lambda f: f.qualifiers.get("locus_tag", [""])[0])
+    sorted_filtered_features = sorted(
+        filtered_features, key=lambda f: f.qualifiers.get(KnownQualifiers.LOCUS_TAG.value, [""])[0]
+    )
 
     feature_collections = []
     for locus_tag, features in itertools.groupby(
-        sorted_filtered_features, key=lambda f: f.qualifiers.get("locus_tag", [""])[0]
+        sorted_filtered_features, key=lambda f: f.qualifiers.get(KnownQualifiers.LOCUS_TAG.value, [""])[0]
     ):
         if not locus_tag:
             # we are in the null scenario, meaning that there are no locus tag information and thus no groupings.
             for feature in features:
-                feature_collection = FeatureIntervalGenBankCollection([feature], seqrecord)
-                feature_collections.append(feature_collection)
+                feature_collection = _construct_feature_collection_from_features([feature], seqrecord)
+                if feature_collection:
+                    feature_collections.append(feature_collection)
         else:
-            feature_collection = FeatureIntervalGenBankCollection(list(features), seqrecord)
-            feature_collections.append(feature_collection)
+            feature_collection = _construct_feature_collection_from_features(list(features), seqrecord)
+            if feature_collection:
+                feature_collections.append(feature_collection)
 
     return [feature_parse_func(fc) for fc in feature_collections] if feature_collections else None
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.8.0"`
	`1`	`+__version__ = "0.9.0"`
`2`	`2`
`3`	`3`	`from abc import ABC, abstractmethod`
`4`	`4`	`from enum import Enum`