loading spell expression data from file

yeastgenome · Sep 18, 2023 · 3d346e9 · 3d346e9
1 parent 427aea4
commit 3d346e9
Show file tree

Hide file tree

Showing 7 changed files with 409 additions and 0 deletions.
diff --git a/settings.gradle b/settings.gradle
@@ -48,3 +48,5 @@ project(':bio-source-sgd-complexes').projectDir = new File(settingsDir, './sgd-c
 project(':bio-source-sgd-db-utr').projectDir = new File(settingsDir, './sgd-db-utr')
 project(':bio-source-diopt-orthologs').projectDir = new File(settingsDir, './diopt-orthologs')
 project(':bio-source-sgd-complementation-db').projectDir = new File(settingsDir, './sgd-complementation-db')
+include ':bio-source-spell-expression-file'
+project(':bio-source-spell-expression-file').projectDir = new File(settingsDir, './spell-expression-file')
diff --git a/spell-expression-file/build.gradle b/spell-expression-file/build.gradle
@@ -0,0 +1,22 @@
+// Source layout for the spell-expression-file data source module.
+sourceSets {
+    main {
+        java {
+            // 'build/gen' is listed as a second Java source directory, so anything
+            // placed there is compiled together with the hand-written sources.
+            srcDirs = ['src/main/java', 'build/gen']
+        }
+        resources {
+            srcDirs = ['src/main/resources']
+        }
+    }
+    test {
+        java {
+            srcDirs = ['src/test/java']
+        }
+        resources {
+            srcDirs = ['src/test/resources']
+        }
+    }
+}
+
+// In addition to the resource directories above, include every *.properties file
+// found in the module root directory when processing resources.
+processResources {
+    from('.') { include ("*.properties")}
+}
diff --git a/spell-expression-file/spell-expression-file.properties b/spell-expression-file/spell-expression-file.properties
@@ -0,0 +1,3 @@
+have.file.custom.tgt = true
+converter.class = org.intermine.bio.dataconversion.SpellExpressionFileConverter
+
diff --git a/...ion-file/src/main/java/org/intermine/bio/dataconversion/SpellExpressionFileConverter.java b/...ion-file/src/main/java/org/intermine/bio/dataconversion/SpellExpressionFileConverter.java
@@ -0,0 +1,335 @@
+package org.intermine.bio.dataconversion;
+
+/*
+ * Copyright (C) 2002-2019 FlyMine
+ *
+ * This code may be freely distributed and modified under the
+ * terms of the GNU Lesser General Public Licence.  This should
+ * be distributed with the code.  See the LICENSE file for more
+ * information or http://www.gnu.org/copyleft/lesser.html.
+ *
+ */
+
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Iterator;
+import org.intermine.dataconversion.ItemWriter;
+import org.intermine.metadata.Model;
+import org.intermine.xml.full.Item;
+import org.apache.tools.ant.BuildException;
+import org.intermine.objectstore.ObjectStoreException;
+import org.intermine.util.FormattedTextParser;
+import org.apache.log4j.Logger;
+import java.util.ArrayList;
+import java.util.Map;
+
+/**
+ * 
+ * @author
+ */
+public class SpellExpressionFileConverter extends BioFileConverter {
+    //
+    protected static final Logger LOG = Logger.getLogger(SpellExpressionFileConverter.class);
+    private static final String DATASET_TITLE = "Spell Expression Data Dumped from MySQL";
+    private static final String DATA_SOURCE_NAME = "Spell";
+    private Map<String, Item> genes = new HashMap();
+    private Map<String, Item> datasets = new HashMap();
+    private Map<String, Item> conditions = new HashMap();
+    private Map<String, Item> tags = new HashMap();
+    private ArrayList<String> filenames = new ArrayList();
+    private static final String TAXON_ID = "4932";
+    private Item organism;
+
+    /**
+     * Constructor
+     *
+     * @param writer the ItemWriter used to handle the resultant items
+     * @param model  the Model
+     */
+    public SpellExpressionFileConverter(ItemWriter writer, Model model) throws ObjectStoreException  {
+        super(writer, model, DATA_SOURCE_NAME, DATASET_TITLE);
+        organism = createItem("Organism");
+        organism.setAttribute("taxonId", TAXON_ID);
+        organism.setAttribute("genus", "Saccharomyces");
+        organism.setAttribute("species", "cerevisiae");
+        organism.setAttribute("name", "Saccharomyces cerevisiae");
+        organism.setAttribute("shortName", "S. cerevisiae");
+        store(organism);
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    public void process(Reader reader) throws Exception {
+        processDataSets(reader); //process each file
+        storeDataSets();
+        storeGenes();
+    }
+
+
+    private void processDataSets(Reader preader) throws Exception, ObjectStoreException {
+
+        int count = 0;
+        String prevFileNo = "";
+        System.out.println("Processing DataSet No. ...");
+
+        Iterator<?> tsvIter;
+        try {
+            tsvIter = FormattedTextParser.parseTabDelimitedReader(preader);
+        } catch (Exception e) {
+            throw new BuildException("cannot parse file: " + getCurrentFile(), e);
+        }
+
+        while (tsvIter.hasNext()) {
+
+            String[] line = (String[]) tsvIter.next();
+
+            if (line.length < 17) {
+                LOG.error("Couldn't process line. Expected 17 cols, but was " + line.length);
+                continue;
+            }
+            count++;
+
+            String geneName = line[0].replace("[convertDB]", "").trim();
+            String pubmedID = line[1].trim();
+            String fileName = line[2].trim();
+            String geoID = line[3].trim();
+            String platformID = line[4].trim();
+            String channelCount = line[5].trim();
+            String datasetName = line[6].trim();
+            String description = line[7].trim();
+            String numCond = line[8].trim();
+            String numGenes = line[9].trim();
+            String author = line[10].trim();
+            String allAuthors = line[11].trim();
+            String title = line[12].trim();
+            String journal = line[13].trim();
+            String pubYear = line[14].trim();
+            String condDesc = line[15].trim();
+            String tags = line[16].trim();
+            String dataTable = line[17].trim();
+
+            prevFileNo = fileName;
+            if(!fileName.equalsIgnoreCase(prevFileNo)) {
+                conditions = new HashMap();  //reset conditions for each dataset - filename change
+            }
+            //System.out.println("gene: "+geneName + "    condDesc: "+ condDesc + " data_table: " + dataTable);
+
+            // create gene first time you see it
+            Item gene = getGene(geneName);
+
+            // create dataset first time you see it
+            Item dataset = getDataSet(pubmedID, fileName, geoID, platformID,
+                    channelCount, datasetName, description, numCond, numGenes,
+                    author, allAuthors, title, journal, pubYear, tags);
+
+            // add score to gene - using the condition and dataset info
+            getConditionScore(dataset, condDesc, dataTable, geneName);
+
+        }
+    }
+
+    /**
+     * @param geneName
+     * @return
+     * @throws ObjectStoreException
+     */
+
+    private Item getGene(String geneName) throws ObjectStoreException {
+
+        Item item = genes.get(geneName);
+
+        if (item == null) {
+            item = createItem("Gene");
+            item.setAttribute("secondaryIdentifier", geneName);
+            item.setReference("organism", organism.getIdentifier());
+            genes.put(geneName, item);
+        }
+
+        return item;
+    }
+
+    /**
+     * @param pubmedID
+     * @param fileName
+     * @param geoID
+     * @param platformID
+     * @param channelCount
+     * @param datasetName
+     * @param description
+     * @param numCond
+     * @param numGenes
+     * @param author
+     * @param allAuthors
+     * @param title
+     * @param journal
+     * @param publicationYear
+     * @param tags
+     * @return
+     * @throws ObjectStoreException
+     */
+
+    private Item getDataSet(String pubmedID, String fileName, String geoID,
+                            String platformID, String channelCount, String datasetName,
+                            String description, String numCond, String numGenes, String author,
+                            String allAuthors, String title, String journal,
+                            String publicationYear, String kwtags) throws ObjectStoreException {
+
+        Item item = datasets.get(fileName);
+
+        if (item == null) {
+
+            item = createItem("ExpressionDataSet");
+
+            item.setAttribute("pubmedID", pubmedID);
+            item.setAttribute("fileName", fileName);
+            item.setAttribute("geoID", geoID);
+            item.setAttribute("platformID", platformID);
+            item.setAttribute("channelCount", channelCount);
+            item.setAttribute("name", datasetName);
+            item.setAttribute("description", description);
+            item.setAttribute("numConds", numCond);
+            item.setAttribute("numGenes", numGenes);
+            item.setAttribute("author", author);
+            item.setAttribute("allAuthors", allAuthors);
+            item.setAttribute("title", title);
+            item.setAttribute("journal", journal);
+            item.setAttribute("publicationYear", publicationYear);
+            //item.setAttribute("tags", tags); store keywords as collection on spelldataset to make it querieable
+
+            if (kwtags.contains("|")) {
+
+                String[] keywords = kwtags.split("\\|");
+
+                for (int i = 0; i < keywords.length; i++) {
+                    String kw = keywords[i];
+
+                    Item dtag = tags.get(kw);
+
+                    if (dtag == null) {
+
+                        dtag = createItem("ExpressionDataSetTag");
+                        dtag.setAttribute("tagname", kw);
+                        try {
+                            store(dtag);
+                        } catch (ObjectStoreException e) {
+                            throw new ObjectStoreException(e);
+                        }
+                        tags.put(kw, dtag);
+                    }
+                    item.addToCollection("expressiondatasettags", dtag.getIdentifier());
+
+                }
+
+            } else {
+                Item dtag = tags.get(kwtags);
+
+                if (dtag == null) {
+                    dtag = createItem("ExpressionDataSetTag");
+                    dtag.setAttribute("tagname", kwtags);
+                    try {
+                        store(dtag);
+                    } catch (ObjectStoreException e) {
+                        throw new ObjectStoreException(e);
+                    }
+                    tags.put(kwtags, dtag);
+                }
+                item.addToCollection("expressiondatasettags", dtag.getIdentifier());
+
+            }
+            //System.out.println("pubmed: "+pubmedID + "    geoID: "+ geoID + " datasetName: " + datasetName);
+            datasets.put(fileName, item);
+        }
+
+        return item;
+    }
+
+    /**
+     * @param dataSet
+     * @param condDesc
+     * @param dataTable
+     * @param geneName
+     * @throws ObjectStoreException
+     */
+    private void getConditionScore(Item dataSet, String condDesc,
+                                   String dataTable, String geneName) throws ObjectStoreException {
+
+        Item gene = genes.get(geneName);
+        String newconds = condDesc.replaceAll("~", "|");
+
+        String[] expconditions = newconds.split("\\|");
+        String[] scores = dataTable.split(",");
+
+
+        for (int i = 0; i < scores.length; i++) {
+
+            String cond = expconditions[i];
+            String condscore = scores[i];
+
+            Item dcond = conditions.get(cond);
+            if (dcond == null) {
+
+                dcond = createItem("ExpressionCondition");
+                dcond.setAttribute("conditionname", cond);
+                dcond.setAttribute("ordernumber", String.valueOf(i));
+                dcond.setReference("expressiondataset", dataSet.getIdentifier());
+                try {
+                    store(dcond);
+                } catch (ObjectStoreException e) {
+                    throw new ObjectStoreException(e);
+                }
+                conditions.put(cond, dcond);
+            }
+
+            dataSet.addToCollection("expressionconditions", dcond.getIdentifier());
+
+            // tie up the score with gene
+            Item score = createItem("ExpressionScore");
+            if (!condscore.equals("NA")) {
+                score.setAttribute("score", condscore);
+            }
+            score.setReference("expressioncondition", dcond.getIdentifier());
+
+            try {
+                store(score);
+            } catch (ObjectStoreException e) {
+                throw new ObjectStoreException(e);
+            }
+
+            gene.addToCollection("expressionScores", score.getIdentifier());
+
+        }
+
+    }
+
+    /**
+     * @throws ObjectStoreException
+     */
+
+    private void storeGenes() throws ObjectStoreException {
+        for (Item gene : genes.values()) {
+            try {
+                store(gene);
+            } catch (ObjectStoreException e) {
+                throw new ObjectStoreException(e);
+            }
+        }
+    }
+
+    /**
+     * @throws ObjectStoreException
+     */
+
+    private void storeDataSets() throws ObjectStoreException {
+        for (Item dataset : datasets.values()) {
+            try {
+                store(dataset);
+            } catch (ObjectStoreException e) {
+                throw new ObjectStoreException(e);
+            }
+        }
+    }
+
+}
+
+
diff --git a/spell-expression-file/src/main/resources/spell-expression-file_additions.xml b/spell-expression-file/src/main/resources/spell-expression-file_additions.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+<!-- This file lists the extensions to the data model that are needed to store data from this source. Everything else is generated automatically from the model description, so adding classes here is all that is required to extend the model. -->
+
+<classes>
+<!-- add any <class> elements here -->
+
+</classes>
diff --git a/spell-expression-file/src/main/resources/spell-expression-file_keys.properties b/spell-expression-file/src/main/resources/spell-expression-file_keys.properties
@@ -0,0 +1,6 @@
+# Key definitions for this source, one per line: <Class>.<key name> = <field>[, <field>...]
+# NOTE(review): SpellExpressionFileConverter creates Gene items identified by
+# secondaryIdentifier, but no Gene key is listed here - confirm whether one is needed.
+DataSet.key_name = name
+DataSource.key_name = name
+SOTerm.key_name_ontology = name, ontology
+Organism.key_taxonid = taxonId
+Ontology.key_name = name
+Publication.key_pubmedid = pubMedId
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		have.file.custom.tgt = true
		converter.class = org.intermine.bio.dataconversion.SpellExpressionFileConverter