VERSION = "1.11.0"

from intermine import query
# Explicit names instead of a wildcard: these are the three classes that
# intermine/iterators.py defines, and the ones the methods below use.
from intermine.iterators import BedIterator, FastaIterator, GFF3Iterator

try:
    import simplejson as json
except ImportError:
    try:
        import json
    except ImportError:
        # Raising a plain string is itself a TypeError; raise a real
        # exception so the caller sees the intended message.
        raise ImportError("No JSON module found - please install simplejson")


class SequenceDataQuery(object):
    """
    Shared export behaviour for sequence-oriented queries
    =====================================================
    Provides the BED, FASTA and GFF3 export methods. Each method reads
    C{self.service} and C{self.query}; this class does not define them,
    so subclasses are expected to supply both.
    """

    def to_query(self):
        """Fulfil the listable query interface"""
        return self

    def bed(self, ucsc_compatible=True):
        """
        Get results as BED
        ==================
        Return a BedIterator object, which stringifies to the BED results,
        and works as an iterator over the lines. After iteration the header
        information is accessible with the iter.header() method

        @param ucsc_compatible: Passed straight through to the BedIterator (default: True)
        @type ucsc_compatible: boolean
        @rtype: BedIterator
        """
        return BedIterator(self.service, self.query, ucsc_compatible)

    def fasta(self):
        """
        Get results as FASTA
        ====================
        Return a FastaIterator object, which stringifies to the Fasta results,
        and works as an iterator over the records (not the lines).
        When attempting to get results as FASTA the query may only have a single
        output column. Errors will be raised otherwise.

        @rtype: FastaIterator
        """
        return FastaIterator(self.service, self.query)

    def gff3(self):
        """
        Get results as GFF3
        ===================
        Return a GFF3Iterator object, which stringifies to the GFF3 results,
        and works as an iterator over the lines. After iteration the header
        information is accessible with the iter.header() method

        @rtype: GFF3Iterator
        """
        return GFF3Iterator(self.service, self.query)
After iteration the header + information is accessible with the iter.header() method + """ + return GFF3Iterator(self.service, self.query) + +class _FakeRoot(object): + @property + def name(self): return "fake-root" + +class RegionQuery(SequenceDataQuery): + """ + Class for querying InterMine Webservices for Features in Genomic Intervals + ========================================================================== + This module allows you to construct queries that retrieve data about sequences and + sequence features in biologically relevant formats, where those features are located + overlapping genomic intervals. + The currently supported formats are UCSC-BED, GFF3, and FASTA. + These queries may also be used to construct lists with. + """ + + + LIST_PATH = "/regions/list" + BED_PATH = "/regions/bed" + FASTA_PATH = "/regions/fasta" + GFF3_PATH = "/regions/gff3" + + def __init__(self, service, organism, feature_types, regions, extension=0, is_interbase=False): + """ + Constructor + =========== + >>> s = Service("www.flymine.org/query", "API-KEY") + >>> org = "D. melanogaster" + >>> regions = ["2L:14614843..14619614"] + >>> feature_types = ["Exon", "Intron"] + >>> q = RegionQuery(s, org, feature_types, regions) + + @param service: The service to connect to. + @type service: intermine.webservice.Service + @param organism: The short name of the organism to look within (eg: D. 
class RegionQuery(SequenceDataQuery):
    """
    Class for querying InterMine Webservices for Features in Genomic Intervals
    ==========================================================================
    This module allows you to construct queries that retrieve data about sequences and
    sequence features in biologically relevant formats, where those features are located
    overlapping genomic intervals.
    The currently supported formats are UCSC-BED, GFF3, and FASTA.
    These queries may also be used to construct lists with.
    """

    LIST_PATH = "/regions/list"
    BED_PATH = "/regions/bed"
    FASTA_PATH = "/regions/fasta"
    GFF3_PATH = "/regions/gff3"

    def __init__(self, service, organism, feature_types, regions, extension=0, is_interbase=False):
        r"""
        Constructor
        ===========
          >>> s = Service("www.flymine.org/query", "API-KEY")
          >>> org = "D. melanogaster"
          >>> regions = ["2L:14614843..14619614"]
          >>> feature_types = ["Exon", "Intron"]
          >>> q = RegionQuery(s, org, feature_types, regions)

        @param service: The service to connect to.
        @type service: intermine.webservice.Service
        @param organism: The short name of the organism to look within (eg: D. melanogaster)
        @type organism: str
        @param feature_types: The types of features to look for
        @type feature_types: list[str]
        @param regions: The regions to search within, in chrX:start..end or chrX\tstart\tend format
        @type regions: list(str)
        @param extension: A number of base-pairs to extend each region on either side (default: 0)
        @type extension: int
        @param is_interbase: Whether to interpret the co-ordinates as interbase co-ordinates
        @type is_interbase: boolean
        """
        # NOTE: this docstring is raw so that the "\t" in the region format
        # is shown literally instead of being turned into tab characters.
        self.service = service
        self.organism = organism
        # Stored as sets, so duplicate feature types / regions collapse.
        self.feature_types = set(feature_types)
        self.regions = set(regions)
        self.extension = extension
        self.is_interbase = is_interbase
        # Per-format paths looked up by the Bed/Fasta/GFF3 iterators in
        # preference to their own resolved query paths.
        self.bed_path = RegionQuery.BED_PATH
        self.fasta_path = RegionQuery.FASTA_PATH
        self.gff3_path = RegionQuery.GFF3_PATH
        self.views = []
        self.root = _FakeRoot()

    def add_view(self, *args):
        """Accept and ignore any arguments; this method does nothing."""
        pass

    def _get_region_query(self):
        """
        Build the region-search payload as a plain dictionary.

        Feature types and regions are emitted in sorted order. They are held
        in sets, whose iteration order is arbitrary, so sorting makes the
        payload reproducible from one call to the next.

        @rtype: dict
        """
        return {
            "organism": self.organism,
            "featureTypes": sorted(self.feature_types),
            "regions": sorted(self.regions),
            "extension": self.extension,
            "isInterbase": self.is_interbase
        }

    def to_query_params(self):
        """
        Returns the query parameters for this request.
        ==============================================
        This method is a required part of the interface for creating lists.
        The region query is JSON-encoded under the single key "query".

        @rtype: dict
        """
        return {"query": json.dumps(self._get_region_query())}

    def get_list_upload_uri(self):
        """
        Returns the full url for the list upload service
        ================================================
        This method is a required part of the interface for creating lists.

        @rtype: str
        """
        return self.service.root + RegionQuery.LIST_PATH

    @property
    def query(self):
        """This object itself: a RegionQuery acts as its own query."""
        return self
class SequenceQuery(SequenceDataQuery):
    """
    Class for querying InterMine Webservices for Sequence based data
    ================================================================
    This module allows you to construct queries that retrieve data about sequences and
    sequence features in biologically relevant formats.
    The currently supported formats are UCSC-BED, GFF3, and FASTA.
    """

    def __init__(self, service_or_query, root=None):
        """
        Constructor
        ===========
          >>> s = Service("www.flymine.org/query")
          >>> bio_query = SequenceQuery(s, "Gene")

          >>> q = s.new_query("Gene").where(s.model.Gene.symbol == ["h", "r", "eve", "zen"])
          >>> bio_query = SequenceQuery(q)

        @param service_or_query: The service to connect to, or a query to wrap.
        @type service_or_query: intermine.webservice.Service or intermine.query.Query
        @param root: The root class of the query
        @type root: str
        """
        if isinstance(service_or_query, query.Query):
            # Wrap the given query and reuse its service.
            self.service = service_or_query.service
            self.query = service_or_query
        else:
            # Build a fresh query against the given service.
            self.service = service_or_query
            self.query = query.Query(self.service.model, self.service, root=root)

        # Set up delegations
        self.add_constraint = self.query.add_constraint
        self.filter = self.where

        self.to_xml = self.query.to_xml

        self.get_logic = self.query.get_logic
        self.set_logic = self.query.set_logic

        # Aliases for the methods defined on this class.
        self.select_sequence = self.set_sequence
        self.select_sequences = self.add_sequence_feature
        self.add_sequence_features = self.add_sequence_feature

    def add_sequence_feature(self, *features):
        """
        Add an arbitrarily long list of sequence features to the query.
        ===============================================================
        Fasta, GFF3 and BED queries all can read information from SequenceFeatures.
        For Fasta you are advised to use the set_sequence method instead,
        as unlike the GFF3 and BED services, the Fasta service can only handle
        queries with one output column.

        @param features: The columns to add; each must refer to a SequenceFeature.
        @raise ValueError: if a column is an attribute or not a SequenceFeature.
        @return: self, for chaining.
        """
        for f in features:
            p = self.query.column(f)._path
            if p.is_attribute() or not p.get_class().isa("SequenceFeature"):
                # One-element tuple so that a tuple-valued f still formats.
                raise ValueError("%s is not a Sequence Feature" % (f,))
            self.query.add_view(str(p) + ".id")

        return self

    def where(self, *args, **kwargs):
        """
        Add a constraint to the query, and return self for chaining.
        """
        self.query.where(*args, **kwargs)
        return self

    def set_sequence(self, f):
        """
        Set the sequence column to retrieve.
        ====================================
        Add a sequence holding object to the query. It can be a SequenceFeature, Protein
        or Sequence object.
        Fasta queries, which read sequences rather than sequence features,
        currently only permit one output column.

        Any existing views are replaced. If the column is rejected, the
        previous views are put back before the error propagates.

        @param f: The column holding the sequence.
        @raise ValueError: if the column carries no sequence information.
        @return: self, for chaining.
        """
        previous_views = self.query.views
        self.query.views = []
        try:
            p = self.query.column(f)._path
            has_sequence = (p.get_class().isa("SequenceFeature") or
                            p.get_class().isa("Protein") or
                            p.get_class().isa("Sequence"))
            if p.is_attribute() or not has_sequence:
                raise ValueError("%s has no sequence information" % (f,))
            self.query.add_view(str(p) + ".id")
        except Exception:
            # Do not leave the query with an emptied selection on failure.
            self.query.views = previous_views
            raise

        return self
class BedIterator(object):
    """
    Iterator over the lines of a BED result set
    ===========================================
    Iterating yields the data lines. Lines starting with "#" or "track" are
    collected as header lines instead and are available from header().
    Converting to a string consumes the remaining lines and returns the
    header followed by those lines.
    """

    # Resolved "query.bed" service paths, cached per service root.
    BED_PATHS = {}

    def __init__(self, service, query, ucsc_compatible=True):
        """
        @param service: Object providing root, resolve_service_path and get_results.
        @param query: Object providing to_query_params() and views; it may
            also provide a bed_path attribute, which then overrides the
            resolved path.
        @param ucsc_compatible: When false, "ucscCompatible": "no" is added
            to the request parameters.
        @type ucsc_compatible: boolean
        """
        if service.root not in BedIterator.BED_PATHS:
            BedIterator.BED_PATHS[service.root] = service.resolve_service_path("query.bed")
        self.path = BedIterator.BED_PATHS[service.root]
        self.service = service
        self.query = query
        self.ucsc_compatible = ucsc_compatible
        self._header = []
        self.it = self._get_iter()

    def header(self):
        """Return the header lines seen so far, joined by newlines."""
        return "\n".join(self._header)

    def _get_iter(self):
        """Request the results and return the underlying line iterator."""
        params = self.query.to_query_params()
        if not self.ucsc_compatible:
            params["ucscCompatible"] = "no"
        # Only a missing attribute should trigger the fallback; any other
        # error raised while reading bed_path is allowed to propagate.
        path = getattr(self.query, "bed_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def __str__(self):
        lines = list(self)
        return "\n".join(self._header + lines)

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next data line, recording any header lines passed on the way.

        @raise StopIteration: when the underlying iterator is exhausted or
            yields an empty (falsy) line.
        """
        line = next(self.it)
        # The emptiness guard must cover both prefixes, hence the tuple form.
        while line and line.startswith(("#", "track")):
            self._header.append(line)
            line = next(self.it)
        if line:
            return line
        raise StopIteration

    # Python 2 iterator protocol.
    next = __next__
class GFF3Iterator(object):
    """
    Iterator over the lines of a GFF3 result set
    ============================================
    Iterating yields the data lines. Lines starting with "#" are collected
    as header lines instead and are available from header().
    Converting to a string consumes the remaining lines and returns the
    header followed by those lines.
    """

    # Resolved "query.gff3" service paths, cached per service root.
    GFF3_PATHS = {}

    def __init__(self, service, query):
        """
        @param service: Object providing root, resolve_service_path and get_results.
        @param query: Object providing to_query_params() and views; it may
            also provide a gff3_path attribute, which then overrides the
            resolved path.
        """
        if service.root not in GFF3Iterator.GFF3_PATHS:
            GFF3Iterator.GFF3_PATHS[service.root] = service.resolve_service_path("query.gff3")
        self.path = GFF3Iterator.GFF3_PATHS[service.root]
        self.service = service
        self.query = query
        self._header = []
        self.it = self._get_iter()

    def header(self):
        """Return the header lines seen so far, joined by newlines."""
        return "\n".join(self._header)

    def _get_iter(self):
        """Request the results and return the underlying line iterator."""
        params = self.query.to_query_params()
        # Only a missing attribute should trigger the fallback; any other
        # error raised while reading gff3_path is allowed to propagate.
        path = getattr(self.query, "gff3_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def __str__(self):
        lines = list(self)
        return "\n".join(self._header + lines)

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next data line, recording any header lines passed on the way.

        @raise StopIteration: when the underlying iterator is exhausted or
            yields an empty (falsy) line.
        """
        line = next(self.it)
        while line and line.startswith("#"):
            self._header.append(line)
            line = next(self.it)
        if line:
            return line
        raise StopIteration

    # Python 2 iterator protocol.
    next = __next__
class FastaIterator(object):
    """
    Iterator over the records of a FASTA result set
    ===============================================
    Iterating yields whole records rather than lines: each item is the
    first line of a record together with every following line up to (but
    not including) the next line that starts with ">", joined by newlines.
    Converting to a string consumes the remaining records and joins them
    with newlines.
    """

    # Resolved "query.fasta" service paths, cached per service root.
    FASTA_PATHS = {}

    def __init__(self, service, query):
        """
        @param service: Object providing root, resolve_service_path and get_results.
        @param query: Object providing to_query_params() and views; it may
            also provide a fasta_path attribute, which then overrides the
            resolved path.
        """
        if service.root not in FastaIterator.FASTA_PATHS:
            FastaIterator.FASTA_PATHS[service.root] = service.resolve_service_path("query.fasta")
        self.path = FastaIterator.FASTA_PATHS[service.root]
        self.service = service
        self.query = query
        self.it = self._get_iter()
        # First line of the next record, read while finishing the previous one.
        self._holdover = None

    def _get_iter(self):
        """Request the results and return the underlying line iterator."""
        params = self.query.to_query_params()
        # Only a missing attribute should trigger the fallback; any other
        # error raised while reading fasta_path is allowed to propagate.
        path = getattr(self.query, "fasta_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def __str__(self):
        records = list(self)
        return "\n".join(records)

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next record as a single newline-joined string.

        @raise StopIteration: once the underlying iterator has been exhausted.
        """
        if self.it is None:
            raise StopIteration

        if self._holdover is not None:
            # The previous call already read this record's first line.
            record = [self._holdover]
            self._holdover = None
        else:
            try:
                record = [next(self.it)]
            except StopIteration:
                self.it = None
                raise

        while True:
            try:
                line = next(self.it)
            except StopIteration:
                # Mark exhaustion so later calls stop without touching
                # the underlying iterator again.
                self.it = None
                break
            if line.startswith(">"):
                self._holdover = line
                break
            record.append(line)

        return "\n".join(record)

    # Python 2 iterator protocol.
    next = __next__