From 0017181392b45ad0b924298aad88adfbe55d71f9 Mon Sep 17 00:00:00 2001 From: sweng66 Date: Thu, 8 Jul 2021 09:49:22 -0700 Subject: [PATCH] adding Go perl lib --- .../GO/AnnotationProvider/AnnotationParser.pm | 2097 ++++++++++++ www/lib/GO/AnnotationProvider/MetaData.pm | 1539 +++++++++ www/lib/GO/OntologyProvider/OboParser.pm | 1042 ++++++ www/lib/GO/README | 1 + www/lib/GO/TermFinder.pm | 2134 +++++++++++++ www/lib/GO/TermFinderReport/Html.pm | 419 +++ www/lib/GO/TermFinderReport/Text.pm | 283 ++ www/lib/GO/Utils/General.pm | 112 + www/lib/GO/View.pm | 2809 +++++++++++++++++ 9 files changed, 10436 insertions(+) create mode 100644 www/lib/GO/AnnotationProvider/AnnotationParser.pm create mode 100644 www/lib/GO/AnnotationProvider/MetaData.pm create mode 100644 www/lib/GO/OntologyProvider/OboParser.pm create mode 100644 www/lib/GO/README create mode 100644 www/lib/GO/TermFinder.pm create mode 100644 www/lib/GO/TermFinderReport/Html.pm create mode 100644 www/lib/GO/TermFinderReport/Text.pm create mode 100644 www/lib/GO/Utils/General.pm create mode 100644 www/lib/GO/View.pm diff --git a/www/lib/GO/AnnotationProvider/AnnotationParser.pm b/www/lib/GO/AnnotationProvider/AnnotationParser.pm new file mode 100644 index 0000000..cf53f0f --- /dev/null +++ b/www/lib/GO/AnnotationProvider/AnnotationParser.pm @@ -0,0 +1,2097 @@ +package GO::AnnotationProvider::AnnotationParser; + +# File : AnnotationParser.pm +# Authors : Elizabeth Boyle; Gavin Sherlock +# Date Begun : Summer 2001 +# Rewritten : September 25th 2002 + +# $Id: AnnotationParser.pm,v 1.13 2015/02/09 18:36:55 mark Exp $ + +# Copyright (c) 2003 Gavin Sherlock; Stanford University + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +=pod + +=head1 NAME + +GO::AnnotationProvider::AnnotationParser - parses a gene annotation file + +=head1 SYNOPSIS + +GO::AnnotationProvider::AnnotationParser - reads a Gene Ontology gene +associations file, and provides methods by which to retrieve the GO +annotations for the an annotated entity. Note, it is case +insensitive, with some caveats - see documentation below. + + my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => "data/gene_association.sgd"); + + my $geneName = "AAT2"; + + print "GO associations for gene: ", join (" ", $annotationParser->goIdsByName(name => $geneName, + aspect => 'P')), "\n"; + + print "Database ID for gene: ", $annotationParser->databaseIdByName($geneName), "\n"; + + print "Database name: ", $annotationParser->databaseName(), "\n"; + + print "Standard name for gene: ", $annotationParser->standardNameByName($geneName), "\n"; + + my $i; + + my @geneNames = $annotationParser->allStandardNames(); + + foreach $i (0..10) { + + print "$geneNames[$i]\n"; + + } + +=head1 DESCRIPTION + +GO::AnnotationProvider::AnnotationParser is a concrete subclass of +GO::AnnotationProvider, and creates a data structure mapping gene +names to GO annotations by parsing a file of annotations provided by +the Gene Ontology Consortium. + +This package provides object methods for retrieving GO annotations +that have been parsed from a 'gene associations' file, provided by +the gene ontology consortium. The format for the file is: + +Lines beginning with a '!' character are comment lines. + + Column Cardinality Contents + ------ ----------- ------------------------------------------------------------- + 0 1 Database abbreviation for the source of annotation (e.g. SGD) + 1 1 Database identifier of the annotated entity + 2 1 Standard name of the annotated entity + 3 0,1 NOT (if a gene is specifically NOT annotated to the term) + 4 1 GOID of the annotation + 5 1,n Reference(s) for the annotation + 6 1 Evidence code for the annotation + 7 0,n With or From (a bit mysterious) + 8 1 Aspect of the Annotation (C, F, P) + 9 0,1 Name of the product being annotated + 10 0,n Alias(es) of the annotated product + 11 1 type of annotated entity (one of gene, transcript, protein) + 12 1,2 taxonomic id of the organism encoding and/or using the product + 13 1 Date of annotation YYYYMMDD + 14 1 Assigned_by : The database which made the annotation + +Columns are separated by tabs. For those entries with a cardinality +greater than 1, multiple entries are pipe , |, delimited. + +Further details can be found at: + +http://www.geneontology.org/doc/GO.annotation.html#file + +The following assumptions about the file are made (and should be true): + + 1. All aliases appear for all entries of a given annotated product + 2. The database identifiers are unique, in that two different + entities cannot have the same database id. + +=head1 TODO + +Also see the TODO list in the parent, GO::AnnotationProvider. + + 1. Add in methods that will allow retrieval of evidence codes with + the annotations for a particular entity. + + 2. Add in methods that return all the annotated entities for a + particular GOID. + + 3. Add in the ability to request only annotations either including + or excluding particular evidence codes. Such evidence codes + could be provided as an anonymous array as the value of a named + argument. + + 4. Same as number 3, except allow the retrieval of annotated + entities for a particular GOID, based on inclusion or exclusion + of certain evidence codes. + + These first four items will require a reworking of how data are + stored on the backend, and thus the parsing code itself, though it + should not affect any of the already existing API. + + 5. Instead of 'use'ing Storable, 'require' it instead, only at the + point of use, which will mean that AnnotationParser can be + happily used in the absence of Storable, just without those + functions that need it. + + 6. Extend the ValidateFile class method to check that an entity + should never be annotated to the same node twice, with the same + evidence, with the same reference. + + 7. An additional checker, that uses an AnnotationProvider in + conjunction with an OntologyProvider, would be useful, that + checks that some of the annotations themselves are valid, ie + that no entities are annotated to the 'unknown' node in a + particular aspect, and also to another node within that same + aspect. Can annotations be redundant? ie, if an entity is + annotated to a node, and an ancestor of the node, is that + annotation redundant? Does it depend on the evidence codes and + references. Or are such annotations reinforcing? These things + are useful to consider when formulating the confidence which can + be attributed to an annotation. + +=cut + +use strict; +use warnings; +use diagnostics; + +use Storable qw (nstore); +use IO::File; + +use vars qw (@ISA $PACKAGE $VERSION); + +use GO::AnnotationProvider; +@ISA = qw (GO::AnnotationProvider); + +$PACKAGE = "GO::AnnotationProvider::AnnotationParser"; +$VERSION = "0.15"; + +# CLASS Attributes +# +# These should be considered as constants, and are initialized here + +my $DEBUG = 0; + +# constants for instance attribute name + +my $kDatabaseName = $PACKAGE.'::__databaseName'; # stores the name of the annotating database +my $kFileName = $PACKAGE.'::__fileName'; # stores the name of the file used to instantiate the object +my $kNameToIdMapInsensitive = $PACKAGE.'::__nameToIdMapInsensitive'; # stores a case insensitive map of all unambiguous names for a gene to the database id +my $kNameToIdMapSensitive = $PACKAGE.'::__nameToIdMapSensitive'; # stores a case sensitive map of all names where a particular casing is unambiguous for a gene to the database id +my $kAmbiguousNames = $PACKAGE.'::__ambiguousNames'; # stores the database id's for all ambiguous names +my $kIdToStandardName = $PACKAGE.'::__idToStandardName'; # stores a map of database id's to standard names of all entities +my $kStandardNameToId = $PACKAGE.'::__StandardNameToId'; # stores a map of standard names to their database id's +my $kUcIdToId = $PACKAGE.'::__ucIdToId'; # stores a map of uppercased databaseIds to the databaseId +my $kUcStdNameToStdName = $PACKAGE.'::__ucStdNameToStdName'; # stores a map of uppercased standard names to the standard name +my $kNameToCount = $PACKAGE.'::__nameToCount'; # stores a case sensitive map of the number of times a name has been seen +my $kGoids = $PACKAGE.'::__goids'; # stores all the goid annotations +my $kNumAnnotatedGenes = $PACKAGE.'::__numAnnotatedGenes'; # stores number of genes with annotations, per aspect + +my $kAmbiguousNamesSensitive = $PACKAGE.'::__ambiguousNamesSensitive'; # names (case sensitive) that are ambiguous + +my $kTotalNumAnnotatedGenes = $PACKAGE.'::__totalNumAnnotatedGenes'; # total number of annotated genes + +# added to keep track of number of each evidence code - mark +my $kEvidenceCodeCount = $PACKAGE.'::__evidenceCodeCount'; +my $kCVSVersion = $PACKAGE."::__cvsVersion"; # mark +my $kValidationDate = $PACKAGE."::__validationDate"; # mark + +# constants to describe what is in which column in the annotation file + +my $kDatabaseNameColumn = 0; +my $kDatabaseIdColumn = 1; +my $kStandardNameColumn = 2; +my $kNotColumn = 3; +my $kGoidColumn = 4; +my $kReferenceColumn = 5; +my $kEvidenceColumn = 6; +my $kWithColumn = 7; +my $kAspectColumn = 8; +my $kNameColumn = 9; +my $kAliasesColumn = 10; +my $kEntityTypeColumn = 11; +my $kTaxonomicIDColumn = 12; +my $kDateColumn = 13; +my $kAssignedByColumn = 14; + +# the following hash of anonymous arrays indicates for each column +# what the maximum and minimum number of entries per column can be. +# If no maximum is indicated, then the maximum is equal to the +# minimum, and exactly that number of entries must exist. + +my %kColumnsToCardinality = ($kDatabaseNameColumn => [1 ], + $kDatabaseIdColumn => [1 ], + $kStandardNameColumn => [1 ], + $kNotColumn => [0, 1], + $kGoidColumn => [1 ], + $kReferenceColumn => [1, "n"], + $kEvidenceColumn => [1 ], + $kWithColumn => [0, "n"], + $kAspectColumn => [1 ], + $kNameColumn => [0, 1], + $kAliasesColumn => [0, "n"], + $kEntityTypeColumn => [1 ], + $kTaxonomicIDColumn => [1, 2], + $kDateColumn => [1 ], + $kAssignedByColumn => [1 ]); + +my $kNumColumnsInFile = scalar keys %kColumnsToCardinality; + +=pod + +=head1 Class Methods + +=cut + +############################################################################ +sub Usage{ +############################################################################ +=pod + +=head2 Usage + +This class method simply prints out a usage statement, along with an +error message, if one was passed in. + +Usage : + + GO::AnnotationProvider::AnnotationParser->Usage(); + +=cut + + my ($class, $message) = @_; + + defined $message && print $message."\n\n"; + + print 'The constructor expects one of two arguments, either a +\'annotationFile\' argument, or and \'objectFile\' argument. When +instantiated with an annotationFile argument, it expects it to +correspond to an annotation file created by one of the GO consortium +members, according to their file format. When instantiated with an +objectFile argument, it expects to open a previously created +annotationParser object that has been serialized to disk (see the +serializeToDisk method). + +Usage: + + my $annotationParser = '.$PACKAGE.'->new(annotationFile => $file); + + my $annotationParser = '.$PACKAGE.'->new(objectFile => $file); +'; + +} + +############################################################################ +sub ValidateFile{ +############################################################################ +=pod + +=head2 ValidateFile + +This class method reads an annotation file, and returns a reference to +an array of errors that are present within the file. The errors are +simply strings, each beginning with "Line $lineNo : " where $lineNo is +the number of the line in the file where the error was found. + +Usage: + + my $errorsRef = GO::AnnotationProvider::AnnotationParser->ValidateFile(annotationFile => $file); + +=cut + + my ($class, %args) = @_; + + my $file = $args{'annotationFile'} || $class->_handleMissingArgument(argument => 'annotationFile'); + + my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; + + my (@errors, @line); + + my ($databaseId, $standardName, $aliases); + my (%idToName, %idToAliases); + + my $lineNo = 0; + + while (<$annotationsFh>){ + + ++$lineNo; + + next if $_ =~ m/^!/; # skip comment lines + + chomp; + + next unless $_; # skip an empty line, if there is one + + @line = split("\t", $_, -1); + + if (scalar @line != $kNumColumnsInFile){ # doesn't have the correct number of columns + + push (@errors, "Line $lineNo has ". scalar @line. "columns, instead of $kNumColumnsInFile."); + + } + + $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); + + # now want to deal with sanity checks... + + ($databaseId, $standardName, $aliases) = @line[$kDatabaseIdColumn, $kStandardNameColumn, $kAliasesColumn]; + + next if ($databaseId eq ""); # will have given incorrect cardinality, but nothing more we can do with it + + if (!exists $idToName{$databaseId}){ + + $idToName{$databaseId} = $standardName; + + }elsif ($idToName{$databaseId} ne $standardName){ + + push (@errors, "Line $lineNo : $databaseId has more than one standard name : $idToName{$databaseId} and $standardName."); + + } + + if (!exists $idToAliases{$databaseId}){ + + $idToAliases{$databaseId} = $aliases; + + }elsif($idToAliases{$databaseId} ne $aliases){ + + push (@errors, "Line $lineNo : $databaseId has more than one collections of aliases : $idToAliases{$databaseId} and $aliases."); + + } + + } + + $annotationsFh->close || die "$PACKAGE cannot close $file : $!"; + + return \@errors; + +} + +############################################################################ +sub __CheckCardinalityOfColumns{ +############################################################################ +# This method checks the cardinality of each column on a line +# +# Usage: +# +# $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); + + my ($class, $errorsRef, $lineRef, $lineNo) = @_; + + my ($cardinality, $min, $max); + + foreach my $column (sort {$a<=>$b} keys %kColumnsToCardinality){ + + ($min, $max) = @{$kColumnsToCardinality{$column}}[0,1]; + + $cardinality = $class->__GetCardinality($lineRef->[$column], $errorsRef, $lineNo); + + if (!defined $max){ # must have a defined number of entries + + if ($cardinality != $min){ + + push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, instead of $min."); + + } + + }else{ # there's a range of allowed number of entries + + if ($cardinality < $min){ # check if less than minimum + + push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is less than the required $min."); + + }elsif ($kColumnsToCardinality{$column}->[1] ne 'n' && + $cardinality > $max){ # check if more than maximum + + push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is more than the allowed $max."); + + } + + } + + } + +} + +############################################################################ +sub __GetCardinality{ +############################################################################ +# This private method returns an integer that indicates the +# cardinality of a text string, where multiple entries are assumed to +# be seperated by the pipe character (|). In addition, it checks +# whether there are null or whitespace only entries. +# +# Usage: +# +# my $cardinality = $class->__GetCardinality($string); + + my ($class, $string, $errorsRef, $lineNo) = @_; + + my $cardinality; + + if (!defined $string || $string eq ""){ + + $cardinality = 0; + + }else{ + + my @entries = split(/\|/, $string, -1); + + foreach my $entry (@entries){ + + if (!defined $entry){ + + push (@{$errorsRef}, "Line $lineNo : There is an undefined value in the string $string."); + + }elsif ($entry =~ /^\s+$/){ + + push (@{$errorsRef}, "Line $lineNo : There is a white-space only value in the string $string."); + + } + + } + + $cardinality = scalar @entries; + + } + + return $cardinality; + +} + +############################################################################ +# +# Constructor, and initialization methods. +# +# All initialization methods are private, except, of course, for the +# new() method. +# +############################################################################ + +############################################################################ +sub new{ +############################################################################ +=pod + +=head1 Constructor + +=head2 new + +This is the constructor for an AnnotationParser object. + +The constructor expects one of two arguments, either a +'annotationFile' argument, or and 'objectFile' argument. When +instantiated with an annotationFile argument, it expects it to +correspond to an annotation file created by one of the GO consortium +members, according to their file format. When instantiated with an +objectFile argument, it expects to open a previously created +annotationParser object that has been serialized to disk (see the +serializeToDisk method). + +Usage: + + my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => $file); + + my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(objectFile => $file); + +=cut + + + my ($class, %args) = @_; + + my $self; + + if (exists($args{'annotationFile'})){ + + $self = {}; + + bless $self, $class; + +# $self->__init($args{'annotationFile'}); +# if (!$self->__init($args{'annotationFile'}, $args{'evidenceCodes'})) { return undef; } # mark + if (!$self->__init(%args)) { return undef; } # mark + + }elsif (exists($args{'objectFile'})){ + + $self = Storable::retrieve($args{'objectFile'}) || die "Could not instantiate $PACKAGE object from objectFile : $!"; + + $self->__setFile($args{'objectFile'}); + + }else{ + + $class->Usage("An annotationFile or objectFile argument must be provided."); + die; + + } + + # now, we have to make some alteration to some hashes to support + # our API for case insensitivity. The API says that if a name is + # supplied that would otherwise be ambiguous, but has a unique + # casing, then we will accept it as that unique cased version. + # Thus, we need to make sure that our $kNameToIdMapSensitive hash + # only tracks those names that were unique in a particular case + + foreach my $name (keys %{$self->{$kNameToCount}}){ + + # go through the has that has a count of each name + + if ($self->{$kNameToCount}{$name} > 1 || exists $self->{$kNameToIdMapInsensitive}{uc($name)}){ + + # if it was seen more than once, or is known to be unique + # in a case insensitive fashion, then delete it. This + # will leave just those that are unique in a case + # sensitive fashion + + delete $self->{$kNameToIdMapSensitive}{$name}; + + } + + } + + return ($self); + +} + +############################################################################ +sub __init{ +############################################################################ +# This private method initializes the object by reading in the data +# from the annotation file. +# +# Usage : +# +# $self->__init($file); +# + + my ($self, %args) = @_; + + my $file = $args{annotationFile}; + my $evidenceCodes = $args{evidenceCodes}; + my $evidenceCodesNot = $args{evidenceCodesNot}; + my $aspectIn = $args{aspect}; + + $self->__setFile($file); + + my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; + + # now read through annotations file + + my (@line, $databaseId, $goid, $aspect, $standardName, $aliases); + + while (<$annotationsFh>){ + + my $in = $_; + + if ($in =~ m/^!(.*)\s+\$$/) { + my @info = split(/\s*\:\s*/, $1); + if ($#info > 0) { + if (($info[0] eq "CVS Version") && ($info[1] eq "Revision") && ($info[2] =~ /^\s*([\d\.]+)(\s|$)/)) { + $self->{$kCVSVersion} = $1; + } elsif (($info[0] eq "GOC Validation Date") && ($info[1] =~ /^\s*([\d\/]+)(\s|$)/)) { + $self->{$kValidationDate} = $1; + } + } + } + next if $in =~ m/^!/; # skip commented lines + + chomp($in); + + next unless $in; # skip an empty line, if there is one + + @line = split("\t", $in, -1); + + next if uc($line[$kNotColumn]) eq 'NOT'; # skip annotations NOT to a GOID + + ($databaseId, $goid, $aspect) = @line[$kDatabaseIdColumn, $kGoidColumn, $kAspectColumn]; + ($standardName, $aliases) = @line[$kStandardNameColumn, $kAliasesColumn]; + + if ($databaseId eq ""){ + + print "On line $. there is a missing databaseId, so it will be ignored.\n"; + next; + + } + + # EXPERIMENTAL + if (($aspectIn) && ($aspect ne $aspectIn)) { + next; + } + + # if filtering on evidence code, ignore those that don't match the list - mark + # we can use one of two modes, include only given codes or exclude all given codes + my $evidenceCode = $line[$kEvidenceColumn]; + $self->{$kEvidenceCodeCount}->{FOUND}->{$evidenceCode}++; + if ($evidenceCodes) { + unless ($evidenceCodes->{$evidenceCode}) { + $self->{$kEvidenceCodeCount}->{SKIPPED}->{$evidenceCode}++; + next; + } + } elsif ($evidenceCodesNot) { + if ($evidenceCodesNot->{$evidenceCode}) { + $self->{$kEvidenceCodeCount}->{SKIPPED}->{$evidenceCode}++; + next; + } + } + $self->{$kEvidenceCodeCount}->{USING}->{$evidenceCode}++; + $self->{$kEvidenceCodeCount}->{TOTAL_USING}++; + + # record the source of the annotation + + $self->{$kDatabaseName} = $line[$kDatabaseNameColumn] if (!exists($self->{$kDatabaseName})); + + # now map the standard name and all aliases to the database id + +# $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); + if (!$self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases)) { return 0; }; # mark + + # and store the GOID + + $self->__storeGOID($databaseId, $goid, $aspect); + + } + + $annotationsFh->close || die "AnnotationParser can't close $file: $!"; + + # now count up how many annotated things we have + + foreach my $databaseId (keys %{$self->{$kGoids}}){ + + $self->{$kTotalNumAnnotatedGenes}++; + + foreach my $aspect (keys %{$self->{$kGoids}{$databaseId}}){ + + $self->{$kNumAnnotatedGenes}{$aspect}++; + + } + + } + + return 1; # mark +} + + +sub version { + my $self = shift; + return $self->{$kCVSVersion}; +} + + +sub validationDate { + my $self = shift; + return $self->{$kValidationDate}; +} + + +sub evidenceCodeCounts { + my ($self, $code) = @_; + + my ($found, $skipped, $using) = ($self->{$kEvidenceCodeCount}->{FOUND}->{$code}, $self->{$kEvidenceCodeCount}->{SKIPPED}->{$code}, $self->{$kEvidenceCodeCount}->{USING}->{$code}); + $found = 0 if (!$found); + $skipped = 0 if (!$skipped); + $using = 0 if (!$using); + return ($found, $skipped, $using); +} + + +sub evidenceCodesUsed { + my $self = shift; + return $self->{$kEvidenceCodeCount}->{USING}; +} +sub evidenceCodesFound { + my $self = shift; + return $self->{$kEvidenceCodeCount}->{FOUND}; +} +sub evidenceCodesSkipped { + my $self = shift; + return $self->{$kEvidenceCodeCount}->{SKIPPED}; +} + +sub totalEvidenceCodesUsed { + my $self = shift; + return $self->{$kEvidenceCodeCount}->{TOTAL_USING}; +} + + +############################################################################ +sub __setFile{ +############################################################################ +# This method sets the name of the file used for construction. +# +# Usage: +# +# $self->__setFile($file); +# + + my ($self, $file) = @_; + + $self->{$kFileName} = $file; + +} + +############################################################################ +sub __mapNamesToDatabaseId{ +############################################################################ +# This private method maps all names and aliases to the databaseId of +# an entity. It also maps the databaseId to itself, to facilitate a +# single way of mapping any identifier to the database id. +# +# This mapping is done so that it can be queried in a case insensitive +# fashion, and thus allow clients to be able to retrieve annotations +# without necessarily knowing the correct casing of any particular +# identifier. +# +# We have to keep the following considerations in mind: +# +# 1. Any identifier may be non-unique with respect to casing, that is, +# it is possible that there is ABC1 and abc1 +# +# 2. We want to be able to returns names and identifiers in their correct +# casing, irrespective of the casing that is provided in the query +# +# 3. In the situation when a name that is ambiguous when considered case +# insensitively is provided, we should check to see whether that casing +# corresponds to a know correct casing, and assume that that is the one +# that they meant. +# +# Usage : +# +# $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); +# +# where $aliases is a pipe-delimited list of aliases + + my ($self, $databaseId, $standardName, $aliases) = @_; + + if (exists $self->{$kIdToStandardName}{$databaseId}){ # we've already seen this databaseId + + if ($self->{$kIdToStandardName}{$databaseId} ne $standardName){ + + # there is a problem in the file - there should only be + # one standard name for a given database id, so we'll die + # here + +# die "databaseId $databaseId maps to more than one standard name : $self->{$kIdToStandardName}{$databaseId} ; $standardName\n"; + # this does sometimes happen, and we would like to handle it - mark + print STDERR "WARNING: databaseId $databaseId maps to more than one standard name : $self->{$kIdToStandardName}{$databaseId} ; $standardName\n"; + return 0; + + }else{ + + # we can simply return, as we've already processed + # information for this databaseId + + return 1; + + } + + } + + # we haven't see this databaseId before, so process the data + + my @aliases = split(/\|/, $aliases); + + my %seen; # sometimes an alias will be the same as the standard name + + foreach my $name ($databaseId, $standardName, @aliases){ + + # here, we simply store, in case sensitive fashion, a mapping + # of the name to databaseId. Later, this map will be + # modified, so it only contains those names where the case + # sensitive version is unique. We need this map to fulfill + # the API requirements that if databaseIdByName() is called + # with a name that is ambiguous, but the casing is unique, + # then it will correctly determine the casing match + + $self->{$kNameToIdMapSensitive}{$name} = $databaseId; + + my $ucName = uc($name); # cache uppercased version for efficiency + + # occasionally, a standard name is also listed in the aliases, + # so we will skip the name if we've already seen it. + + # note that for now, we are doing this case sensitively - it + # is possible that a gene is referred to by the same name + # twice but with different casing - however, if those are the + # only times that those particular versions are seen, then + # they will still be treated unambiguously. + + next if exists ($seen{$name}); + + # let's keep a count of every time a name with the same casing + # is seen, across all genes + + $self->{$kNameToCount}{$name}++; + + # now we have to deal with the name, depending on whether we + # newly determine it is ambiguous, whether we already know + # that name is ambiguous, or whether (so far) the name appears + # to be unique + + # for something to be newly ambiguous, the case insensitive + # version of its name must have been seen associated with some + # other database id already. + + # if the case insensitive version of the name has already been + # seen with the same database id, it is still not ambiguous + + if (exists $self->{$kNameToIdMapInsensitive}{$ucName} && $self->{$kNameToIdMapInsensitive}{$ucName} ne $databaseId){ + + # so record what it maps to + + # current databaseId + + push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); + + # and previously seen databaseId + + push (@{$self->{$kAmbiguousNames}{$ucName}}, $self->{$kNameToIdMapInsensitive}{$ucName}); + + # and now delete the previously seen databaseId from the unambiguous mapping + + delete $self->{$kNameToIdMapInsensitive}{$ucName}; + + }elsif (exists $self->{$kAmbiguousNames}{$ucName}){ # we already know it's ambiguous + + # so add in this new databaseId + + push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); + + }else{ # otherwise simply map it unambiguously for now, as we haven't see the name before + + $self->{$kNameToIdMapInsensitive}{$ucName} = $databaseId; + + } + + $seen{$name} = undef; # remember that we've seen the name for this row + + } + + # now we need to record some useful mappings + + # map databaseId and standardName to each other - these should + # always be unique when treated case sensitively + + $self->{$kIdToStandardName}{$databaseId} = $standardName; # record the standard name for the database id + $self->{$kStandardNameToId}{$standardName} = $databaseId; # also make the reverse look up + + # Now map upper cased versions of the databaseId and name to their original form + # These are not guaranteed to be unique, so we use arrays instead + + push (@{$self->{$kUcIdToId}{uc($databaseId)}}, $databaseId); + push (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}, $standardName); + + return 1; # mark +} + +############################################################################ +sub __storeGOID{ +############################################################################ +# This private method stores a GOID for a given databaseId, on a per +# aspect basis, in a hash. +# +# Usage: +# +# $self->__storeGOID($databaseId, $goid, $aspect); +# + + my ($self, $databaseId, $goid, $aspect) = @_; + + $self->{$kGoids}{$databaseId}{$aspect}{$goid} = undef; + +} + +=pod + +=head1 Public instance methods + +=head1 Some methods dealing with ambiguous names + +Because there are many names by which an annotated entity may be +referred to, that are non-unique, there exist a set of methods for +determining whether a name is ambiguous, and to what database +identifiers such ambiguous names may refer. + +Note, that the AnnotationParser is now case insensitive, but with some +caveats. For instance, you can use 'cdc6' to retrieve data for CDC6. +However, This if gene has been referred to as abc1, and another +referred to as ABC1, then these are treated as different, and +unambiguous. However, the text 'Abc1' would be considered ambiguous, +because it could refer to either. On the other hand, if a single gene +is referred to as XYZ1 and xyz1, and no other genes have that name (in +any casing), then Xyz1 would still be considered unambiguous. + +=cut + +############################################################################## +sub nameIsAmbiguous{ +############################################################################## + +=pod + +=head2 nameIsAmbiguous + +This public method returns a boolean to indicate whether a name is +ambiguous, i.e. whether the name might map to more than one entity (and +therefore more than one databaseId). + +NB: API change: + +nameIsAmbiguous is now case insensitive - that is, if there is a name +that is used twice using different casing, that will be treated as +ambiguous. Previous versions would have not treated these as +ambiguous. In the case that a name is provided in a certain casing, +which was encountered only once, then it will be treated as +unambiguous. This is the price of wanting a case insensitive +annotation parser... + +Usage: + + if ($annotationParser->nameIsAmbiguous($name)){ + + do something useful....or not.... + + } + +=cut + + my ($self, $name) = @_; + + die "You must supply a name to nameIsAmbiguous" if !defined ($name); + + # a name might appear in the hash of ambiguous names - however, + # it is possible that the provided name matches the case of one of + # the provided versions exactly, and thus may not be ambiguous + + # of course, it is also possible that there were actually more than + # one copy of that alias, with exactly the same casing, which would + # be ambiguous + + # thus, we need to find out whether the provided name matches the case + # of a something exactly, which refers to only one entity + + # a name being ambiguous boils down to whether it has been seen + # more than once in that exact case, or in the case that it has + # not been seen at all in that exact case, whether it is ambiguous + # in upper case form. + + my $isAmbiguous; + + if (!exists $self->{$kNameToCount}{$name}){ + + # we haven't seen this casing at all, so see if it's ambiguous + # in the uppercased version + + $isAmbiguous = exists $self->{$kAmbiguousNames}{uc($name)}; + + }elsif ($self->{$kNameToCount}{$name} > 1){ + + # we've seen this exact casing more than once, so it has to be + # ambiguous + + $isAmbiguous = 1; + +# # but if it is unambiguous as a common name, we will use that +# if ($annotation->nameIsStandardName($gene)) { +# if ($annotation->databaseIdByStandardName($gene)) { +# $isAmbiguous = 0; +# } +# } + + }else{ + + # it must only have ever been seen once in this exact casing, + # so it's unambiguous + + $isAmbiguous = 0; + + } + +# if ($isAmbiguous && $self->databaseIdByStandardName($name)) { +# # actually ambiguity is resolved since it is unambiguous as a standard name +# $isAmbiguous = 0; +# } + + return $isAmbiguous; + +} + +############################################################################ +sub databaseIdsForAmbiguousName{ +############################################################################ +=pod + +=head2 databaseIdsForAmbiguousName + +This public method returns an array of database identifiers for an +ambiguous name. If the name is not ambiguous, an empty list will be +returned. + +NB: API change: + +databaseIdsForAmbiguousName is now case insensitive - that is, if +there is a name that is used twice using different casing, that will +be treated as ambiguous. Previous versions would have not treated +these as ambiguous. However, if the name provided is of the exact +casing as a name that appeared only once with that exact casing, then +it is treated as unambiguous. This is the price of wanting a case +insensitive annotation parser... + +Usage: + + my @databaseIds = $annotationParser->databaseIdsForAmbiguousName($name); + +=cut + + my ($self, $name) = @_; + + die "You must supply a name to databaseIdsForAmbiguousName" if !defined ($name); + + if ($self->nameIsAmbiguous($name)){ + + return @{$self->{$kAmbiguousNames}{uc($name)}}; + + }else{ + + return (); + + } + +} + +############################################################################ +sub ambiguousNames{ +############################################################################ +=pod + +=head2 ambiguousNames + +This method returns an array of names, which from the annotation file +have been deemed to be ambiguous. + +Note - even though we have made the annotation parser case +insensitive, if something appeared in the annotations file as BLAH1 +and blah1, we would not deem either of these to be ambiguous. +However, if it appeared as blah1 twice, referring to two different +genes, then blah1 would be ambiguous. + +Usage: + + my @ambiguousNames = $annotationParser->ambiguousNames(); + +=cut + + my $self = shift; + + # we can simply generate a list of case-sensitive names that have + # appeared more than once - we'll cache them so they don't have to + # be recalculated in the event that they're asked for again + + if (!exists ($self->{$kAmbiguousNamesSensitive})){ + + my @names; + + foreach my $name (keys %{$self->{$kNameToCount}}){ + + push(@names, $name) if ($self->{$kNameToCount}{$name} > 1); + + } + + $self->{$kAmbiguousNamesSensitive} = \@names; + + } + + return @{$self->{$kAmbiguousNamesSensitive}}; + +} + +=pod + +=head1 Methods for retrieving GO annotations for entities + +=cut + +############################################################################ +sub goIdsByDatabaseId{ +############################################################################ +=pod + +=head2 goIdsByDatabaseId + +This public method returns a reference to an array of GOIDs that are +associated with the supplied databaseId for a specific aspect. If no +annotations are associated with that databaseId in that aspect, then a +reference to an empty array will be returned. If the databaseId is +not recognized, then undef will be returned. In the case that a +databaseId is ambiguous (for instance the same databaseId exists but +with different casings) then if the supplied database id matches the +exact case of one of those supplied, then that is the one it will be +treated as. In the case where the databaseId matches none of the +possibilities by case, then a fatal error will occur, because the +provided databaseId was ambiguous. + +Usage: + + my $goidsRef = $annotationParser->goIdsByDatabaseId(databaseId => $databaseId, + aspect => ); + +=cut + + my ($self, %args) = @_; + + my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); + my $databaseId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); + + my $mappedId; # will store the id as listed in the annotations file + + if (exists $self->{$kUcIdToId}{uc($databaseId)}){ # we recognize it + + if (scalar (@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ + + # it's unambiguous + + $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; + + }else{ + + # it may be ambiguous, but we'll check to see if the provided one + # is of exactly the correct case + + foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ + + if ($databaseId eq $id){ # we have a match + + $mappedId = $id; + last; + + } + + } + + if (!defined $mappedId){ + + # we got no perfect match, so it's ambiguous, and we die + + die "ERROR: $databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". + join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); + + } + + } + + }else{ # we don't recognize it + + return ; # note return here + + } + + # if we get here, then we have a recognized, and unambiguous database id + + return $self->_goIdsByMappedDatabaseId(databaseId => $mappedId, + aspect => $aspect); + +} + +############################################################################ +sub _goIdsByMappedDatabaseId{ +############################################################################ +# This protected method returns a reference to an array of GOIDs that +# are associated with the supplied databaseId for a specific aspect. +# If no annotations are associated with that databaseId in that +# aspect, then a reference to an empty array will be returned. If the +# databaseId is not recognized, then undef will be returned. The +# supplied databaseId must NOT be ambiguous, i.e. it must be a real +# databaseId known to exist. If it is possibly ambiguous, use the +# goIdsByDatabaseId method instead. +# +# Usage: +# +# my $goidsRef = $annotationParser->_goIdsByMappedDatabaseId(databaseId => $databaseId, +# aspect => ); + + + my ($self, %args) = @_; + + my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); + my $mappedId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); + + if (exists $self->{$kGoids}{$mappedId}{$aspect}){ # it has annotations + + return [keys %{$self->{$kGoids}{$mappedId}{$aspect}}]; + + }else{ # it has no annotations + + return []; # reference to empty array + + } + +} + +############################################################################ +sub goIdsByStandardName{ +############################################################################ +=pod + +=head2 goIdsByStandardName + +This public method returns a reference to an array of GOIDs that are +associated with the supplied standardName for a specific aspect. If +no annotations are associated with the entity with that standard name +in that aspect, then a reference to an empty list will be returned. +If the supplied name is not used as a standard name, then undef will +be returned. In the case that the supplied standardName is ambiguous +(for instance the same standardName exists but with different casings) +then if the supplied standardName matches the exact case of one of +those supplied, then that is the one it will be treated as. In the +case where the standardName matches none of the possibilities by case, +then a fatal error will occur, because the provided standardName was +ambiguous. + +Usage: + + my $goidsRef = $annotationParser->goIdsByStandardName(standardName =>$standardName, + aspect =>); + +=cut + + my ($self, %args) = @_; + + my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); + my $standardName = $args{'standardName'} || $self->_handleMissingArgument(argument => 'standardName'); + + # now we have to determine if the standardName is ambiguous or not + + # first, return if there is no standard name for the supplied string + + return undef if !exists $self->{$kUcStdNameToStdName}{uc($standardName)}; + + # now see if we have 1 or more mappings + + my $mappedName; + + if (scalar @{$self->{$kUcStdNameToStdName}{uc($standardName)}} == 1){ + + # we have a single mapping + + $mappedName = $self->{$kUcStdNameToStdName}{uc($standardName)}[0]; + + }else{ + + # there's more than one, so see if the case matched exactly + + foreach my $name (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}){ + + if ($name eq $standardName){ + + $mappedName = $name; + last; + + } + + } + + if (!defined $mappedName){ + + # we got no perfect match, so it's ambiguous, and we die + + die "ERROR: $standardName is ambiguous as a standardName, and could be used to refer to one of:\n\n". + join("\n", @{$self->{$kUcStdNameToStdName}{uc($standardName)}}); + + } + + } + + # now we're here, we know we have a mapped standard name, which + # must thus map to a databaseId + + my $databaseId = $self->_databaseIdByMappedStandardName($mappedName); + + return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, + aspect => $aspect); + +} + +############################################################################ +sub goIdsByName{ +############################################################################ +=pod + +=head2 goIdsByName + +This public method returns a reference to an array of GO IDs that are +associated with the supplied name for a specific aspect. If there are +no GO associations for the entity corresponding to the supplied name +in the provided aspect, then a reference to an empty list will be +returned. If the supplied name does not correspond to any entity, +then undef will be returned. Because the name can be any of the +databaseId, the standard name, or any of the aliases, it is possible +that the name might be ambiguous. Clients of this object should first +test whether the name they are using is ambiguous, using the +nameIsAmbiguous() method, and handle it accordingly. If an ambiguous +name is supplied, then it will die. + +NB: API change: + +goIdsByName is now case insensitive - that is, if there is a name that +is used twice using different casing, that will be treated as +ambiguous. Previous versions would have not treated these as +ambiguous. This is the price of wanting a case insensitive annotation +parser. In the event that a name is provided that is ambiguous +because of case, if it matches exactly the case of one of the possible +matches, it will be treated unambiguously. + +Usage: + + my $goidsRef = $annotationParser->goIdsByName(name => $name, + aspect => ); + +=cut + + my ($self, %args) = @_; + + my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); + my $name = $args{'name'} || $self->_handleMissingArgument(argument => 'name'); + + die "You have supplied an ambiguous name to goIdsByName" if ($self->nameIsAmbiguous($name)); + + # if we get here, the name is not ambiguous, so it's safe to call + # databaseIdByName + + my $databaseId = $self->databaseIdByName($name); + + return undef if !defined $databaseId; # there is no such name + + # we should have a databaseId in the correct casing now + + return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, + aspect => $aspect); + +} + +=pod + +=head1 Methods for mapping different types of name to each other + +=cut + +############################################################################ +sub standardNameByDatabaseId{ +############################################################################ +=pod + +=head2 standardNameByDatabaseId + +This method returns the standard name for a database id. + +NB: API change + +standardNameByDatabaseId is now case insensitive - that is, if there +is a databaseId that is used twice (or more) using different casing, +it will be treated as ambiguous. Previous versions would have not +treated these as ambiguous. This is the price of wanting a case +insensitive annotation parser. In the event that a name is provided +that is ambiguous because of case, if it matches exactly the case of +one of the possible matches, it will be treated unambiguously. + +Usage: + + my $standardName = $annotationParser->standardNameByDatabaseId($databaseId); + +=cut + + my ($self, $databaseId) = @_; + + die "You must supply a databaseId to standardNameByDatabaseId" if !defined ($databaseId); + + # first return if there is no databaseId for the supplied string + + return undef if (!exists $self->{$kUcIdToId}{uc($databaseId)}); + + # now, check whether it's ambiguous as a databaseId + + my $mappedId; + + if (scalar(@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ + + # we have a single mapping + + $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; + + }else{ + + # there's more than one, so see if the provided case matches + # exactly one of them + + foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ + + if ($databaseId eq $id){ + + $mappedId = $id; + last; + + } + + } + + if (!defined $mappedId){ + + # we got no perfect match, so it's ambiguous, and we die + + die "ERROR: $databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". + join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); + + } + + } + + + return ($self->{$kIdToStandardName}{$mappedId}); + +} + +############################################################################ +sub databaseIdByStandardName{ +############################################################################ +=pod + +=head2 databaseIdByStandardName + +This method returns the database id for a standard name. + +NB: API change + +databaseIdByStandardName is now case insensitive - that is, if there +is a standard name that is used twice (or more) using different +casing, it will be treated as ambiguous. Previous versions would have +not treated these as ambiguous. This is the price of wanting a case +insensitive annotation parser. In the event that a name is provided +that is ambiguous because of case, if it matches exactly the case of +one of the possible matches, it will be treated unambiguously. + +Usage: + + my $databaseId = $annotationParser->databaseIdByStandardName($standardName); + +=cut + + my ($self, $standardName) = @_; + + die "You must supply a standardName to databaseIdByStandardName" if !defined ($standardName); + + # first return if there is no standard name for the supplied string + + return undef if (!exists $self->{$kUcStdNameToStdName}{uc($standardName)}); + + # now see if it's ambiguous or not + + my $mappedStandardName; + + if (scalar(@{$self->{$kUcStdNameToStdName}{uc($standardName)}}) == 1){ + + # it's not ambiguous + + $mappedStandardName = $self->{$kUcStdNameToStdName}{uc($standardName)}[0]; + + }else{ + + # there's more than one, so see if the supplied name matches + # the case of one of them exactly + + foreach my $name (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}){ + + if ($standardName eq $name){ + + # added a check to detect more than one instance + # of this standard name in this exact case -- mark + + if ($mappedStandardName) { + # more than one + $mappedStandardName = undef; + last; + } + + $mappedStandardName = $name; +# last; + + } + + } + + if (!defined $mappedStandardName){ + +# die "$standardName is ambiguous as a standard name, and could be used to refer to one of:\n\n". +# join("\n", @{$self->{$kUcStdNameToStdName}{uc($standardName)}}); +# print STDERR "$standardName is ambiguous as a standard name, and does not exactly match the case of any name. It could be one of:\n", +# join("\n", @{$self->{$kUcStdNameToStdName}{uc($standardName)}}), "\n"; + print STDERR "$standardName is ambiguous as a standard name\n"; + return undef; + + } + + } + +# return ($self->{$kStandardNameToId}{$standardName}); +#! I think this is correct - otherwise a different casing for a non-ambiguous name will return undef - mark +#print "XX standardName $standardName, mapped $mappedStandardName, ID from mapped ".$self->{$kStandardNameToId}{$mappedStandardName}."\n" if (!$self->{$kStandardNameToId}{$standardName}); + return ($self->{$kStandardNameToId}{$mappedStandardName}); + +} + +############################################################################ +sub _databaseIdByMappedStandardName{ +############################################################################ +# This protected method returns the database id for a standard name that is +# guaranteed to be non-ambiguous, and in the correct casing +# +# Usage: +# +# my $databaseId = $annotationParser->_databaseIdByMappedStandardName($standardName); +# + + my ($self, $standardName) = @_; + + die "You must supply a standardName to _databaseIdByMappedStandardName" if !defined ($standardName); + + return ($self->{$kStandardNameToId}{$standardName}); + +} + +############################################################################ +sub databaseIdByName{ +############################################################################ +=pod + +=head2 databaseIdByName + +This method returns the database id for any identifier for a gene +(e.g. by databaseId itself, by standard name, or by alias). If the +used name is ambiguous, then the program will die. Thus clients +should call the nameIsAmbiguous() method, prior to using this method. +If the name does not map to any databaseId, then undef will be +returned. + +NB: API change + +databaseIdByName is now case insensitive - that is, if there is a name +that is used twice using different casing, that will be treated as +ambiguous. Previous versions would have not treated these as +ambiguous. This is the price of wanting a case insensitive annotation +parser. In the event that a name is provided that is ambiguous +because of case, if it matches exactly the case of one of the possible +matches, it will be treated unambiguously. + +Usage: + + my $databaseId = $annotationParser->databaseIdByName($name); + +=cut + + my ($self, $name) = @_; + + die "You must supply a name to databaseIdByName" if !defined ($name); + +# die "You have supplied an ambiguous name to databaseIdByName" if ($self->nameIsAmbiguous($name)); + return undef if ($self->nameIsAmbiguous($name)); + + # give them the case insensitive unique map, or if there is none, + # then the case sensitive version + + my $databaseId = $self->{$kNameToIdMapInsensitive}{uc($name)} || $self->{$kNameToIdMapSensitive}{$name}; + + return $databaseId; + +} + +############################################################################ +sub standardNameByName{ +############################################################################ +=pod + +=head2 standardNameByName + +This public method returns the standard name for the the gene +specified by the given name. Because a name may be ambiguous, the +nameIsAmbiguous() method should be called first. If an ambiguous name +is supplied, then it will die with an appropriate error message. If +the name does not map to a standard name, then undef will be returned. + +NB: API change + +standardNameByName is now case insensitive - that is, if there is a +name that is used twice using different casing, that will be treated +as ambiguous. Previous versions would have not treated these as +ambiguous. This is the price of wanting a case insensitive annotation +parser. + +Usage: + + my $standardName = $annotationParser->standardNameByName($name); + +=cut + + my ($self, $name) = @_; + + die "You must supply a name to standardNameByName" if !defined ($name); + + die "You have supplied an ambiguous name to standardNameByName" if ($self->nameIsAmbiguous($name)); + + my $databaseId = $self->databaseIdByName($name); + + if (defined $databaseId){ + + return $self->{$kIdToStandardName}{$databaseId}; + + }else{ + + return undef; + + } + +} + +=pod + +=head1 Other methods relating to names + +=cut + +############################################################################ +sub nameIsStandardName{ +############################################################################ +=pod + +=head2 nameIsStandardName + +This method returns a boolean to indicate whether the supplied name is +used as a standard name. + +NB : API change. + +This is now case insensitive. If you provide abC1, and ABc1 is a +standard name, then it will return true. + +Usage : + + if ($annotationParser->nameIsStandardName($name)){ + + # do something + + } + +=cut + + my ($self, $name) = @_; + + die "You must supply a name to nameIsStandardName" if !defined($name); + + return exists ($self->{$kUcStdNameToStdName}{uc($name)}); + +} + +############################################################################ +sub nameIsDatabaseId{ +############################################################################ +=pod + +=head2 nameIsDatabaseId + +This method returns a boolean to indicate whether the supplied name is +used as a database id. + +NB : API change. + +This is now case insensitive. If you provide abC1, and ABc1 is a +database id, then it will return true. + +Usage : + + if ($annotationParser->nameIsDatabaseId($name)){ + + # do something + + } + +=cut + + + my ($self, $databaseId) = @_; + + die "You must supply a potential databaseId to nameIsDatabaseId" if !defined($databaseId); + + return exists ($self->{$kUcIdToId}{uc($databaseId)}); + +} + +############################################################################ +sub nameIsAnnotated{ +############################################################################ +=pod + +=head2 nameIsAnnotated + +This method returns a boolean to indicate whether the supplied name has any +annotations, either when considered as a databaseId, a standardName, or +an alias. If an aspect is also supplied, then it indicates whether that +name has any annotations in that aspect only. + +NB: API change. + +This is now case insensitive. If you provide abC1, and ABc1 has +annotation, then it will return true. + +Usage : + + if ($annotationParser->nameIsAnnotated(name => $name)){ + + # blah + + } + +or: + + if ($annotationParser->nameIsAnnotated(name => $name, + aspect => $aspect)){ + + # blah + + } + + +=cut + + my ($self, %args) = @_; + + my $name = $args{'name'} || die "You must supply a name to nameIsAnnotated"; + + my $aspect = $args{'aspect'}; + + my $isAnnotated = 0; + + my $ucName = uc($name); + + if (!defined ($aspect)){ # if there's no aspect + + $isAnnotated = (exists ($self->{$kNameToIdMapInsensitive}{$ucName}) || exists ($self->{$kAmbiguousNames}{$ucName})); + + }else{ + + if ($self->nameIsDatabaseId($name) && @{$self->goIdsByDatabaseId(databaseId => $name, + aspect => $aspect)}){ + + $isAnnotated = 1; + + }elsif ($self->nameIsStandardName($name) && @{$self->goIdsByStandardName(standardName => $name, + aspect => $aspect)}){ + + $isAnnotated = 1; + + }elsif (!$self->nameIsAmbiguous($name)){ + + my $goidsRef = $self->goIdsByName(name => $name, + aspect => $aspect); + + if (defined $goidsRef && @{$goidsRef}){ + + $isAnnotated = 1; + + } + + }else { # MUST be an ambiguous name, that's not used as a standard name + + foreach my $databaseId ($self->databaseIdsForAmbiguousName($name)){ + + if (@{$self->goIdsByDatabaseId(databaseId => $name, + aspect => $aspect)}){ + + $isAnnotated = 1; + last; # as soon as we know, we can finish + + } + + } + + } + + } + + return $isAnnotated; + +} + +=pod + +=head1 Other public methods + +=cut + +############################################################################ +sub databaseName{ +############################################################################ +=pod + +=head2 databaseName + +This method returns the name of the annotating authority from the file +that was supplied to the constructor. + +Usage : + + my $databaseName = $annotationParser->databaseName(); + +=cut + + my $self = shift; + + return $self->{$kDatabaseName}; + +} + +############################################################################ +sub numAnnotatedGenes{ +############################################################################ +=pod + +=head2 numAnnotatedGenes + +This method returns the number of entities in the annotation file that +have annotations in the supplied aspect. If no aspect is provided, +then it will return the number of genes with an annotation in at least +one aspect of GO. + +Usage: + + my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes(); + + my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes($aspect); + +=cut + + my ($self, $aspect) = @_; + + if (defined ($aspect)){ + + return $self->{$kNumAnnotatedGenes}{$aspect}; + + }else{ + + return $self->{$kTotalNumAnnotatedGenes}; + + } + +} + +############################################################################ +sub allDatabaseIds{ +############################################################################ +=pod + +=head2 allDatabaseIds + +This public method returns an array of all the database identifiers + +Usage: + + my @databaseIds = $annotationParser->allDatabaseIds(); + +=cut + + my $self = shift; + + return keys (%{$self->{$kIdToStandardName}}); + +} + +############################################################################ +sub allStandardNames{ +############################################################################ +=pod + +=head2 allStandardNames + +This public method returns an array of all standard names. + +Usage: + + my @standardNames = $annotationParser->allStandardNames(); + +=cut + + my $self = shift; + + return keys(%{$self->{$kStandardNameToId}}); + +} + +=pod + +=head1 Methods to do with files + +=cut + +############################################################################ +sub file{ +############################################################################ +=pod + +=head2 file + +This method returns the name of the file that was used to instantiate +the object. + +Usage: + + my $file = $annotationParser->file; + +=cut + + return $_[0]->{$kFileName}; + +} + +############################################################################ +sub serializeToDisk{ +############################################################################ +=pod + +=head2 serializeToDisk + +This public method saves the current state of the Annotation Parser +Object to a file, using the Storable package. The data are saved in +network order for portability, just in case. The name of the object +file is returned. By default, the name of the original file will be +used to make the name of the object file (including the full path from +where the file came), or the client can instead supply their own +filename. + +Usage: + + my $fileName = $annotationParser->serializeToDisk; + + my $fileName = $annotationParser->serializeToDisk(filename => $filename); + +=cut + + my ($self, %args) = @_; + + my $fileName; + + if (exists ($args{'filename'})){ # they supply their own filename + + $fileName = $args{'filename'}; + + }else{ # we build a name from the file used to instantiate ourselves + + $fileName = $self->file; + + if ($fileName !~ /\.obj$/){ # if we weren't instantiated from an object + + $fileName .= ".obj"; # add a .obj suffix to the name + + } + + } + + nstore ($self, $fileName) || die "$PACKAGE could not serialize itself to $fileName : $!"; + + return ($fileName); + +} + +1; # to keep perl happy + +############################################################################ +# MORE P O D D O C U M E N T A T I O N # +############################################################################ + +=pod + +=head1 Modifications + + # This script was modified by mark to filter by evidence code. Updates + # to the module were merged. + +CVS info is listed here: + + # $Author: mark $ + # $Date: 2015/02/09 18:36:55 $ + # $Log: AnnotationParser.pm,v $ + # Revision 1.13 2015/02/09 18:36:55 mark + # databaseIdByName now returns undef for ambiguous names instead of dying + # + # Revision 1.12 2014/10/15 20:08:18 mark + # correct TAIR gene link + # + # Revision 1.11 2012/11/21 21:31:52 mark + # Fixed bug where ambiguity was not detected when standard name of exact same case mapped to more than one database ID + # + # Revision 1.10 2011/06/03 18:01:40 mark + # added validation date parsing and methods + # + # Revision 1.9 2011/04/20 20:23:18 mark + # don't die if ambiguous standard name, return undef instead + # + # Revision 1.8 2010/03/09 21:29:13 mark + # added error detection and reporting to progress + # + # Revision 1.7 2010/03/08 03:01:59 mark + # fix background bug + # + # Revision 1.6 2009/08/12 05:07:50 mark + # more evidence code counting + # + # Revision 1.5 2009/07/29 05:19:20 mark + # fixed bug with uninitialized value when printing evidence code counts, added version method and parsing + # + # Revision 1.4 2009/07/14 21:08:51 mark + # print evidence codes, return evidence code counts + # + # Revision 1.3 2009/05/18 22:10:17 mark + # more evidence code support, added REPLACE_DBID support + # + # Revision 1.2 2009/05/05 17:53:37 mark + # use obo files, remove duplicate IDs + # + # Revision 1.35 2008/05/13 23:06:16 sherlock + # updated to fix bug with querying with a name that was unambiguous when + # taking its casing into account. + # + # Revision 1.34 2007/03/18 03:09:05 sherlock + # couple of PerlCritic suggested improvements, and an extra check to + # make sure that the cardinality between standard names and database ids + # is 1:1 + # + # Revision 1.33 2006/07/28 00:02:14 sherlock + # fixed a couple of typos + # + # Revision 1.32 2004/07/28 17:12:10 sherlock + # bumped version + # + # Revision 1.31 2004/07/28 17:03:49 sherlock + # fixed bugs when calling goidsByDatabaseId instead of goIdsByDatabaseId + # on lines 1592 and 1617 - thanks to lfriedl@cs.umass.edu for spotting this. + # + # Revision 1.30 2003/11/26 18:44:28 sherlock + # finished making all the changes that were required to make it case + # insensitive, and modified POD accordingly. It appears to all work as + # expected... + # + # Revision 1.29 2003/11/22 00:05:05 sherlock + # made a very large number of changes to make much of it + # case-insensitive, such that using CDC6 or cdc6 amounts to the same + # query, as long as both versions of that name don't exist in the + # annotations file. Still needs a little work to allow names that are + # potentially ambiguous to be not ambiguous, if their casing matches + # exactly one form of the name that has been seen. Have started to + # update test suite to check all the case insensitive stuff, but is not + # yet finished. + # + # + +=head1 AUTHORS + +Elizabeth Boyle, ell@mit.edu + +Gavin Sherlock, sherlock@genome.stanford.edu + +=cut diff --git a/www/lib/GO/AnnotationProvider/MetaData.pm b/www/lib/GO/AnnotationProvider/MetaData.pm new file mode 100644 index 0000000..844e3d1 --- /dev/null +++ b/www/lib/GO/AnnotationProvider/MetaData.pm @@ -0,0 +1,1539 @@ +package GO::AnnotationProvider::MetaData; + +############################################################################## +# FILE: MetaData.pm +# DATE CREATED: August 2004 +# AUTHOR: Linda McMahan +# +############################################################################## + +############################################################################## +# POD (PLAIN OLD DOCUMENTATION) +############################################################################## + +=pod + +=head1 NAME + +GO::AnnotationProvider::MetaData - Provide attributes and methods to deal with gene annotation files available at GO ftp site. + +=head1 SYNOPSIS + +=over 4 + + use GO::AnnotationProvider::AnnotationParser; + use go::AnnotationProvider::MetaData; + + + my $metaData = GO::AnnotationProvider::MetaData->new(); + + # Get this organism's total annotated gene number from its gene association file + my $annotionFilePath = '/usr/local/go/lib/GO/gene_association.sgd'; + my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotionFilePath); + my @databaseIds = $annotationParser->allDatabaseIds(); + my $annotateGeneNum = scalar(@databaseIds); + + my $annotionFile = 'gene_association.sgd'; + + # Call these mutator methods to set values of these attributes for this organism + $metaData->setANNOTATION_FILE($file, $annotationFile); + $metaData->setANNOTATE_GENE_NUM($annotateGeneNum, $annotationFile); + $metaData->setESTIMATE_GENE_NUM($annotationFile); + + # Call these accessor methods to return values of these attributes for this organism + my ($organismGeneUrl) = $metaData->getGENE_URL($annotationFile); + my ($organismCommonName) = $metaData->getCOMMON_NAME($annotationFile); + my ($ogranismScientificName) = $metaData->getORGANISM($annotationFile); + my ($organismDatabaseName) = $metaData->getORGANISM_DB_NAME($annotationFile); + my ($organismDatabaseUrl) = $metaData->getORGANISM_DB_URL($annotationFile); + + print "Organism Gene Url: ", $organismGeneUrl, "\n", + "Organism Common Name: ", $organismCommonName, "\n", + "Organism Scientific Name: ", $ogranismScientificName, "\n", + "Organism Database Name: ", $organismDatabaseName, "\n", + "Organism Database URL: ", $organismDatabaseUrl, "\n"; + + # Call this accessor method to return a reference of all attributes for this organism + my ($organismInfoHashRef) = $metaData->getOrganismInfo($annotationFile); + # @organismInfoArray contains references to the info about all the organisms + my @organismsInfoArray; + push(@organismsInfoArray, $organismInfoHashRef); + + # Call this method to create an html table of gene association file for these files + my $filepath = "/Genomics/curtal/www/go/html/GOTermMapper/MetaData.html"; + $metaData->createGeneAsscTable(\@organismsInfoArray, $filepath); + + # Call this method to get the values and labels of gene annotation files. These returned values and + # labels can then be used to create cgi script's pull-down menu of gene annotation files + my ($geneAssValuesArrayRef, $geneAssLabelsHashRef) = $metaData->geneAnnotationFilesMenu; + + # Call this method to get the estimate total gene number for this organism + my ($estimateGeneNum) = $metaData->organismEstimateTotalGeneNum($annotionFile); + + # Call this method to get the gene url for this organism + my $opt_u = 'http://genome-www4.stanford.edu/cgi-bin/SGD/locus.pl?locus='; + my ($geneUrl) = $metaData->organismGeneUrl($annotationFile, $opt_u); + + # Call this method to check if the gene annotation file exists for this organism + my ($organismFileExist) = $metaData->organismFileExist($annotationFile); + +=back + +=cut + +############################################################################## + + +use strict; +use warnings; +use vars '$AUTOLOAD'; #keep 'use strict' happy +use Carp; + +use CGI qw/:all :html3/; + +use vars qw (@ISA @EXPORT_OK %dir); #keep 'use strict' happy +use Exporter; +@ISA = ('Exporter'); +@EXPORT_OK = qw(createGeneAsscTable geneAnnotationFilesMenu geneAnnotationSlimFilesMenu organismEstimateTotalGeneNum organismGeneUrl organismFileExist); + + +my $DEBUG = 0; #debugging variable; set to 1 for printed feedback (i.e. to print debug statements) + + +############################################################################### +## ATTRIBUTES OF ORGANISMS +############################################################################### +## we annotate gene product to the organism rather than to the +## annotation file, on the premise that there can be multiple +## annotations files per organism (sources, custom, etc,) +## + +my %organismEstimateTotalGeneNum = ( +# 'Arabidopsis thaliana' => 52816, + 'Bacillus anthracis' => 5507, + 'Bos taurus' => 37225, + 'Caenorhabditis elegans' => 22246, +# 'Candida albicans' => 6840, + 'Coxiella burnetii' => 2095, + 'Danio rerio' => 22409, + 'Dictyostelium discoideum' => 12098, + 'Drosophila melanogaster' => 16085, + 'Escherichia coli' => 7187, + 'Gallus gallus' => 30837, + 'Geobacter sulfurreducens PCA' => 3533, +# 'Homo sapiens' => 23531, +# 'Mus musculus' => 33884, + 'Oryza sativa' => 41521, + 'Plasmodium falciparum' => 5400, + 'Pseudomonas syringae' => 5763, +# 'Rattus norvegicus' => 23751, + 'Saccharomyces cerevisiae' => 7166, + 'Shewanella oneidensis' => 4843, + 'Trypanosoma brucei' => 708, + 'Vibrio cholerae' => 3885, + ); + + +############################################################################### +## ATTRIBUTES OF GO GENE ANNOTATION FILES +############################################################################### +## associations files have the following STATIC attributes (defined): +# +# +# GENE_URL - a template URL to link to for more inforamtion on a gene +# COMMON_NAME (Note, this should probably be an attribute of the organism; oh well...) +# ORGANISM - Genus and species (must match keys of organismEstimateTotalGeneNum, above.) +# ORGANISM_DB_NAME - annotations authority or file source +# ORGANISM_DB_URL - URL of authority +# DEFAULT_GENE_URL - a sample link +# README_URL - README file from authority located at GO site, http://www.geneontology.org/gene-associations/readme/ +# IDENTIFIER_2 - type of identifier at column 2 of association file (e.g. 'systematic name', 'gene name', 'gene symbol') +# IDENTIFIER_2_EXAMPLE - example identifier at column 2 of association file +# IDENTIFIER_3 - type of identifier at column 3 of association file +# IDENTIFIER_3_EXAMPLE - example identifier at column 3 of association file +# IDENTIFIER_11 - type of identifier at column 11 of association file +# IDENTIFIER_11_EXAMPLE - example identifier at column 11 of association file +# SAMPLE_GENE_LIST - sample list of identifiers, usable by the tools +# CONVERSION_TOOL - {a reference to an hash of label => url}, suggested to assist in identifier conversion +# ANNOTATION_FILE => the full name of the annotation file +# +## the following attributes are only set is the gene assocaition files +## are usable by the web applications, GOTermFinder, GOTermMapper +# MENU_LABEL => the menu label the the GOTermFinder will display +# SLIM_MENU_LABEL => the menu label the the GOTermMapper will display +# +## the following attribute is set AFTER the file as been parsed +# ANNOTATE_GENE_NUM - the number of annotated gene products in the association file +# +## and the following attribute may be set if it is defined in % organismEstimateTotalGeneNum, above +# ESTIMATE_GENE_NUM - the estimated number of gene products in the organism, both annotated and unannotated +# + +my $uniprot = {'UniProt Batch Retrieval' => 'http://www.uniprot.org/batch/', + 'CNIO Clone/Gene ID converter' => 'http://idconverter.bioinfo.cnio.es/'}; + +my %_annotationFiles = ( + GeneDB_Lmajor => { + GENE_URL => undef, + COMMON_NAME => 'Skin parasite', + ORGANISM => 'Leishmania major', + ORGANISM_DB_NAME => 'L. major GeneDB', + ORGANISM_DB_URL => 'http://www.genedb.org/Homepage/Lmajor', + DEFAULT_GENE_URL => '', + README_URL => 'http://www.geneontology.org/gene-associations/readme/GeneDB_Lmajor.README', + IDENTIFIER_2 => 'Systematic_ID', + IDENTIFIER_3 => 'Systematic_ID', + IDENTIFIER_11 => '', + IDENTIFIER_2_EXAMPLE => 'LM5.39', + IDENTIFIER_3_EXAMPLE => 'L6071.09', + IDENTIFIER_11_EXAMPLE => '', + SAMPLE_GENE_LIST => '/sampleGeneFiles/geneDB_Lmajor.txt', + ANNOTATION_FILE => 'gene_association.GeneDB_Lmajor', + MENU_LABEL => 'GeneDB_Lmajor', + SLIM_MENU_LABEL => 'GeneDB_Lmajor (Generic GO slim)' + }, + GeneDB_Pfalciparum => { + GENE_URL => 'http://www.genedb.org/genedb/Search?organism=malaria&name=', + COMMON_NAME => 'Malaria parasite', + ORGANISM => 'Plasmodium falciparum', + ORGANISM_DB_NAME => 'P. falciparum GeneDB', + ORGANISM_DB_URL => 'http://www.genedb.org/Homepage/Pfalciparum', + DEFAULT_GENE_URL => 'http://www.genedb.org/genedb/Search?organism=malaria&name'.'=PFF1145c', + README_URL => 'http://www.geneontology.org/gene-associations/readme/GeneDB_Pfalciparum.README', + IDENTIFIER_2 => 'Systematic Name', + IDENTIFIER_3 => 'Systematic Name', + IDENTIFIER_11 => '', + IDENTIFIER_2_EXAMPLE => 'PFF1145c', + IDENTIFIER_3_EXAMPLE => 'MAL6P1.191', + IDENTIFIER_11_EXAMPLE => '', + SAMPLE_GENE_LIST => '/sampleGeneFiles/geneDB_pfalciparum.txt', + ANNOTATION_FILE => 'gene_association.GeneDB_Pfalciparum', + MENU_LABEL => 'GeneDB_Pfalciparum', + SLIM_MENU_LABEL => 'GeneDB_Pfalciparum (Generic GO slim)' + }, + # GeneDB_Spombe => { + # GENE_URL => 'http://www.genedb.org/genedb/Search?organism=pombe&name=', + # COMMON_NAME => 'Yeast', + # ORGANISM => 'Schizosaccharomyces pombe', + # ORGANISM_DB_NAME => 'S. pombe GeneDB', + # ORGANISM_DB_URL => 'http://www.genedb.org/genedb/pombe/', + # DEFAULT_GENE_URL => 'http://www.genedb.org/genedb/Search?organism'.'=pombe&name'.'=clp1', + # README_URL => 'http://www.geneontology.org/gene-associations/readme/GeneDB_Spombe.README', + # IDENTIFIER_2 => 'Systematic Name', + # IDENTIFIER_3 => 'Gene Name', + # IDENTIFIER_11 => 'Gene Synonym', + # IDENTIFIER_2_EXAMPLE => 'SPAC1782.09c', + # IDENTIFIER_3_EXAMPLE => 'clp1', + # IDENTIFIER_11_EXAMPLE => 'flp1', + # SAMPLE_GENE_LIST => '/sampleGeneFiles/geneDB_spombe.txt', + # ANNOTATION_FILE => 'gene_association.GeneDB_Spombe', + # MENU_LABEL => 'GeneDB_Spombe', + # SLIM_MENU_LABEL => 'GeneDB_Spombe (S. pombe GO slim) (Process only)' + # }, + # GeneDB_Spombe_generic => { + # ANNOTATION_FILE => 'gene_association.GeneDB_Spombe_generic', + # SLIM_MENU_LABEL => 'GeneDB_Spombe (Generic GO slim)' + # }, + pombase => { +# GENE_URL => 'http://www.genedb.org/genedb/Search?organism=pombe&name=', + GENE_URL => 'http://www.pombase.org/search/ensembl/', + COMMON_NAME => 'Yeast', + ORGANISM => 'Schizosaccharomyces pombe', +# ORGANISM_DB_NAME => 'S. pombe GeneDB', + ORGANISM_DB_NAME => 'PomBase', +# ORGANISM_DB_URL => 'http://www.genedb.org/genedb/pombe/', + ORGANISM_DB_URL => 'http://www.pombase.org/', + DEFAULT_GENE_URL => 'http://www.pombase.org/spombe/result/clp1', + README_URL => 'http://www.geneontology.org/gene-associations/readme/pombase.README', + IDENTIFIER_2 => 'Systematic Name', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Gene Synonym', + IDENTIFIER_2_EXAMPLE => 'SPAC1782.09c', + IDENTIFIER_3_EXAMPLE => 'clp1', + IDENTIFIER_11_EXAMPLE => 'flp1', + SAMPLE_GENE_LIST => '/sampleGeneFiles/pombase.txt', + ANNOTATION_FILE => 'gene_association.pombase', + MENU_LABEL => 'PomBase', + SLIM_MENU_LABEL => 'PomBase (S. pombe GO slim) (Process only)' + }, + GeneDB_Spombe_generic => { + ANNOTATION_FILE => 'gene_association.pombase_generic', + SLIM_MENU_LABEL => 'PomBase (Generic GO slim)' + }, + GeneDB_Tbrucei => { + GENE_URL => 'http://www.genedb.org/genedb/Search?organism=tryp&name=', + COMMON_NAME => 'Trypanosome', + ORGANISM => 'Tryanosoma brucei', + ORGANISM_DB_NAME => 'T. brucei GeneDB', + ORGANISM_DB_URL => 'http://www.genedb.org/Homepage/Tbruceibrucei927', + DEFAULT_GENE_URL => 'http://www.genedb.org/genedb/Search?organism'.'=tryp&name'.'=Tb927.2.380', + README_URL => 'http://www.geneontology.org/gene-associations/readme/GeneDB_Tbrucei.README', + IDENTIFIER_2 => 'Systematic Name', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Gene Synonym', + IDENTIFIER_2_EXAMPLE => 'Tb927.2.380', + IDENTIFIER_3_EXAMPLE => 'RHS2', + IDENTIFIER_11_EXAMPLE => '3B10.145', + SAMPLE_GENE_LIST => '/sampleGeneFiles/geneDB_Tbrucei.txt', + ANNOTATION_FILE => 'gene_association.GeneDB_Tbrucei', + MENU_LABEL => 'GeneDB_Tbrucei', + SLIM_MENU_LABEL => 'GeneDB_Tbrucei (Generic GO slim)' + }, + # GeneDB_tsetse => { + # GENE_URL => 'http://www.genedb.org/genedb/Search?organism=glossina&name=', + # COMMON_NAME => 'Tsetse fly', + # ORGANISM => 'Glossina morsitans', + # ORGANISM_DB_NAME => 'G. morsitans GeneDB', + # ORGANISM_DB_URL => 'http://www.genedb.org/genedb/glossina/index.jsp', + # DEFAULT_GENE_URL => 'http://www.genedb.org/genedb/Search?organism='.'glossina&name'.'=Gmm-1621', + # README_URL => 'http://www.geneontology.org/gene-associations/readme/GeneDB_Tsetse.README', + # IDENTIFIER_2 => 'Systematic Name', + # IDENTIFIER_3 => '', + # IDENTIFIER_11 => '', + # IDENTIFIER_2_EXAMPLE => 'Gmm-1621', + # IDENTIFIER_3_EXAMPLE => '', + # IDENTIFIER_11_EXAMPLE => '', + # SAMPLE_GENE_LIST => '/sampleGeneFiles/geneDB_tsetse.txt', + # ANNOTATION_FILE => 'gene_association.GeneDB_tsetse', + # MENU_LABEL => 'GeneDB_tsetse', + # SLIM_MENU_LABEL => 'GeneDB_tsetse (Generic GO slim)' + # }, + cgd => { + GENE_URL => 'http://www.candidagenome.org/cgi-bin/locus.pl?locus=', + COMMON_NAME => 'Candida', + ORGANISM => 'Candida albicans', + ORGANISM_DB_NAME => 'CGD', + ORGANISM_DB_URL => 'http://www.candidagenome.org/', + DEFAULT_GENE_URL => 'http://www.candidagenome.org/cgi-bin/locus.pl?locus='.'=act1', + README_URL => 'http://www.geneontology.org/gene-associations/readme/cgd.README', + IDENTIFIER_2 => 'CGD_ID', + IDENTIFIER_3 => 'Standard Name', + IDENTIFIER_11 => 'Systematic name', + IDENTIFIER_2_EXAMPLE => 'CAL0005516', + IDENTIFIER_3_EXAMPLE => 'AAF1', + IDENTIFIER_11_EXAMPLE => 'orf19.7436', + SAMPLE_GENE_LIST => '/sampleGeneFiles/cgd.txt', + ANNOTATION_FILE => 'gene_association.cgd', + MENU_LABEL => 'CGD (Candida - C. albicans)', + SLIM_MENU_LABEL => 'CGD - Candida (Generic GO slim)' + }, + dictyBase => { + GENE_URL => 'http://dictybase.org/db/cgi-bin/dictyBase/locus.pl?locus=', + COMMON_NAME => 'Slime mold', + ORGANISM => 'Dictyostelium discoideum', + ORGANISM_DB_NAME => 'DictyBase', + ORGANISM_DB_URL => 'http://dictybase.org/', + DEFAULT_GENE_URL => 'http://dictybase.org/db/cgi-bin/dictyBase/locus.pl?locus'.'=yakA', + README_URL => '', + IDENTIFIER_2 => 'DictyBase_ID', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Alias', + IDENTIFIER_2_EXAMPLE => 'DDB_G0267450', + IDENTIFIER_3_EXAMPLE => 'yakA', + IDENTIFIER_11_EXAMPLE => 'dagB', + SAMPLE_GENE_LIST => '/sampleGeneFiles/dictyBase.txt', + ANNOTATION_FILE => 'gene_association.dictyBase', + MENU_LABEL => 'DictyBase (Slime mold - D. discoideum)', + SLIM_MENU_LABEL => 'DictyBase (Generic GO slim)' + }, +# ddb => { +# GENE_URL => 'http://dictybase.org/db/cgi-bin/dictyBase/locus.pl?locus=', +# COMMON_NAME => 'Slime mold', +# ORGANISM => 'Dictyostelium discoideum', +# ORGANISM_DB_NAME => 'DictyBase', +# ORGANISM_DB_URL => 'http://dictybase.org/', +# DEFAULT_GENE_URL => 'http://dictybase.org/db/cgi-bin/dictyBase/locus.pl?locus'.'=yakA', +# README_URL => '', +# IDENTIFIER_2 => 'DictyBase_ID', +# IDENTIFIER_3 => 'Gene Name', +# IDENTIFIER_11 => 'Alias', +# IDENTIFIER_2_EXAMPLE => 'DDB0191191', +# IDENTIFIER_3_EXAMPLE => 'yakA', +# IDENTIFIER_11_EXAMPLE => 'dagB', +# SAMPLE_GENE_LIST => '/sampleGeneFiles/ddb.txt', +# ANNOTATION_FILE => 'gene_association.ddb', +# MENU_LABEL => 'DictyBase (Slime mold - D. discoideum)', +# SLIM_MENU_LABEL => 'DictyBase (Generic GO slim)' +# }, + fb => { + GENE_URL => 'http://flybase.bio.indiana.edu/.bin/fbidq.html?', + COMMON_NAME => 'Fruit fly', + ORGANISM => 'Drosophila melanogaster', + ORGANISM_DB_NAME => 'FlyBase', + ORGANISM_DB_URL => 'http://flybase.bio.indiana.edu/', + DEFAULT_GENE_URL => 'http://flybase.bio.indiana.edu/.bin/fbidq.html?FBgn0013749', + README_URL => 'http://www.geneontology.org/gene-associations/readme/fb.README', + IDENTIFIER_2 => 'FlyBase_ID', + IDENTIFIER_3 => 'Gene Symbol', + IDENTIFIER_11 => 'Gene Synonym', + IDENTIFIER_2_EXAMPLE => 'FBgn0013749', + IDENTIFIER_3_EXAMPLE => 'Arf102F', + IDENTIFIER_11_EXAMPLE => 'ARF2', + SAMPLE_GENE_LIST => '/sampleGeneFiles/fb.txt', + CONVERSION_TOOL => '', + ANNOTATION_FILE => 'gene_association.fb', + MENU_LABEL => 'FlyBase (Fruit fly - D. melanogaster)', + SLIM_MENU_LABEL => 'FlyBase (Generic GO slim)' + }, + goa_arabidopsis => { + ORGANISM => 'Arabidopsis thaliana', + # subsumed by TAIR file + }, + goa_chicken => { + ORGANISM => 'Gallus gallus', + COMMON_NAME => 'Chicken', + ORGANISM_DB_NAME => 'GOA @EBI', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_chicken.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_chicken', + MENU_LABEL => 'goa_chicken' + }, + goa_cow => { + ORGANISM => 'Bos taurus', + COMMON_NAME => 'Cow', + ORGANISM_DB_NAME => 'GOA @EBI', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_cow.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_cow', + MENU_LABEL => 'goa_cow' + }, + goa_Ecoli => { + ORGANISM => 'Escherichia coli', + COMMON_NAME => 'Bacterium coli', + ORGANISM_DB_NAME => 'GOA @EBI', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_Ecoli', + MENU_LABEL => 'goa_ecoli' + }, + goa_human => { + GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene=', + COMMON_NAME => 'Human', + ORGANISM => 'Homo sapiens', + ORGANISM_DB_NAME => 'GOA @EBI', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + DEFAULT_GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene='.'ENSG00000039600&db'.'=core', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index', + IDENTIFIER_2_EXAMPLE => 'O94993', + IDENTIFIER_3_EXAMPLE => 'SX30_HUMAN', + IDENTIFIER_11_EXAMPLE => 'IPI00016680', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_human.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_human', + MENU_LABEL => 'goa_human', + SLIM_MENU_LABEL => 'goa_human (GOA GO slim)' + }, + goa_human_generic => {ANNOTATION_FILE => 'gene_association.goa_human_generic', + SLIM_MENU_LABEL => 'goa_human (Generic GO slim)' + }, + goa_human_hgnc => { + # augmented goa_human +# GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene=', + GENE_URL => 'http://www.genenames.org/data/hgnc_data.php?hgnc_id=', + COMMON_NAME => 'Human', + ORGANISM => 'Homo sapiens', + ORGANISM_DB_NAME => 'GOA @EBI + XREFs', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + DEFAULT_GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene='.'ENSG00000039600&db'.'=core', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index with additional crossreferenced gene symbols', + IDENTIFIER_2_EXAMPLE => 'O94993', + IDENTIFIER_3_EXAMPLE => 'SX30_HUMAN', + IDENTIFIER_11_EXAMPLE => 'IPI00016680|SOX30', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_human_hgnc.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_human_hgnc', + MENU_LABEL => 'goa_human_hgnc', + SLIM_MENU_LABEL => 'goa_human_hgnc (GOA GO slim)' + }, + goa_human_hgnc_generic => {ANNOTATION_FILE => 'gene_association.goa_human_hgnc_generic', + SLIM_MENU_LABEL => 'goa_human_hgnc (Generic GO slim)' + }, + goa_human_ensembl => { + # augmented goa_human + GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene=', + COMMON_NAME => 'Human', + ORGANISM => 'Homo sapiens', + ORGANISM_DB_NAME => 'GOA @EBI + Ensembl', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + DEFAULT_GENE_URL => 'http://www.ensembl.org/Homo_sapiens/geneview?gene='.'ENSG00000039600&db'.'=core', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index with additional crossreferenced gene symbols', + IDENTIFIER_2_EXAMPLE => 'O94993', + IDENTIFIER_3_EXAMPLE => 'SX30_HUMAN', + IDENTIFIER_11_EXAMPLE => 'IPI00016680|SOX30', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_human_ensembl.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_human_ensembl', + MENU_LABEL => 'goa_human_ensembl', + SLIM_MENU_LABEL => 'goa_human_ensembl (GOA GO slim)' + }, + goa_human_ensembl_generic => {ANNOTATION_FILE => 'gene_association.goa_human_ensembl_generic', + SLIM_MENU_LABEL => 'goa_human_ensembl (Generic GO slim)' + }, + # # subsumed by RGD? + # GENE_URL => 'http://www.ensembl.org/Mus_musculus/geneview?gene=', + # COMMON_NAME => 'Mouse', + # ORGANISM => 'Mus musculus', + # ORGANISM_DB_NAME => 'GOA @EBI', + # ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + # DEFAULT_GENE_URL => 'http://www.ensembl.org/Mus_musculus/geneview?gene='.'ENSMUSG00000055141&db'.'=core', + # README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + # IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + # IDENTIFIER_3 => 'UniProt_Id (or Ensembl_ID)', + # IDENTIFIER_11 => 'International Protein Index', + # IDENTIFIER_2_EXAMPLE => 'P07628', + # IDENTIFIER_3_EXAMPLE => 'KLK8_MOUSE', + # IDENTIFIER_11_EXAMPLE => 'IPI00130801', + # SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_mouse.txt', + # CONVERSION_TOOL => $uniprot, + # ANNOTATION_FILE => 'gene_association.goa_mouse', + # }, +# goa_mouse_generic => {ANNOTATION_FILE => 'gene_association.goa_mouse_generic', +# SLIM_MENU_LABEL => 'goa_mouse (Generic GO slim)' +# }, +# goa_pdb => { +# # UNUSED??? +# GENE_URL => 'http://www.rcsb.org/pdb/cgi/explore.cgi?pdbId=', +# COMMON_NAME => '', +# ORGANISM => 'Protein Data Bank', +# ORGANISM_DB_NAME => 'PDB', +# ORGANISM_DB_URL => 'http://www.rcsb.org/pdb/index.html', +# DEFAULT_GENE_URL => 'http://www.rcsb.org/pdb/cgi/explore.cgi?pdbId'.'=104M', +# README_URL => 'http://www.geneontology.org/gene-associations/readme/goa_pdb.README', +# IDENTIFIER_2 => 'PDB_ID', +# IDENTIFIER_3 => '', +# IDENTIFIER_11 => '', +# IDENTIFIER_2_EXAMPLE => '104M', +# IDENTIFIER_3_EXAMPLE => '', +# IDENTIFIER_11_EXAMPLE => '', +# SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_pdb.txt', +# ANNOTATION_FILE => 'gene_association.goa_pdb', +# MENU_LABEL => 'goa_pdb', +# SLIM_MENU_LABEL => 'goa_pdb (GOA GO slim)' +# }, +# goa_pdb_generic => {ANNOTATION_FILE => 'gene_association.goa_pdb_generic', +# SLIM_MENU_LABEL => 'goa_pdb (Generic GO slim)' +# }, + goa_rat => { + # subsumed by RGD? + GENE_URL => 'http://www.ensembl.org/Rattus_norvegicus/geneview?gene=', + COMMON_NAME => 'Rat', + ORGANISM => 'Rattus norvegicus', + ORGANISM_DB_NAME => 'GOA @EBI', + ORGANISM_DB_URL => 'http://www.ebi.ac.uk/GOA/', + DEFAULT_GENE_URL => 'http://www.ensembl.org/Rattus_norvegicus/geneview?gene'.'=ENSRNOG00000017002&db'.'=core', + README_URL => 'http://www.geneontology.org/gene-associations/readme/goa.README', + IDENTIFIER_2 => 'UniProt_Accession (or Ensembl_ID)', + IDENTIFIER_3 => 'UniProt_ID (or Ensembl_ID)', + IDENTIFIER_11 => 'International Protein Index', + IDENTIFIER_2_EXAMPLE => 'P18090', + IDENTIFIER_3_EXAMPLE => 'B1AR_RAT', + IDENTIFIER_11_EXAMPLE => 'IPI00188184', + SAMPLE_GENE_LIST => '/sampleGeneFiles/goa_rat.txt', + CONVERSION_TOOL => $uniprot, + ANNOTATION_FILE => 'gene_association.goa_rat', + }, +# goa_rat_generic => {ANNOTATION_FILE => 'gene_association.goa_rat_generic', +# SLIM_MENU_LABEL => 'goa_rat (Generic GO slim)' +# }, + goa_uniprot => { + GENE_URL => undef, + ORGANISM => 'undef', + ANNOTATION_FILE => 'gene_association.goa_uniprot', +# UNUSED??? +# MENU_LABEL => 'goa_uniprot', +# SLIM_MENU_LABEL => 'goa_unipro (Generic GO slim)' + }, + goa_zebrafish => { + ORGANISM => 'Danio rerio', + # subsumed by ZFIN file + }, + gramene_oryza => { + GENE_URL => 'http://www.gramene.org/perl/protein_search?acc=', + COMMON_NAME => 'Rice', + ORGANISM => 'Oryza sativa', + ORGANISM_DB_NAME => 'Gramene', + ORGANISM_DB_URL => 'http://www.gramene.org/', + DEFAULT_GENE_URL => 'http://www.gramene.org/perl/protein_search?acc'.'=Q94HJ1', + README_URL => 'http://www.geneontology.org/gene-associations/readme/gramene_oryza.README', + IDENTIFIER_2 => 'Swiss-Prot/TrEMBL_ID', + IDENTIFIER_3 => 'Gene Name/Symbol', + IDENTIFIER_11 => '', + IDENTIFIER_2_EXAMPLE => 'Q94HJ1', + IDENTIFIER_3_EXAMPLE => 'OJ991113_30.2', + IDENTIFIER_11_EXAMPLE => '', + SAMPLE_GENE_LIST => '/sampleGeneFiles/gramene_oryza.txt', + ANNOTATION_FILE => 'gene_association.gramene_oryza', + MENU_LABEL => 'Gramene (Rice - O. sativa)', + SLIM_MENU_LABEL => 'Gramene (Generic GO slim)' + }, + mgi => { + GENE_URL => 'http://www.informatics.jax.org/searches/accession_report.cgi?id='.'', + COMMON_NAME => 'Mouse', + ORGANISM => 'Mus musculus', + ORGANISM_DB_NAME => 'MGI', + ORGANISM_DB_URL => 'http://www.informatics.jax.org/', + DEFAULT_GENE_URL => 'http://www.informatics.jax.org/searches/accession_report.cgi?id'.'=MGI:1889008', + README_URL => 'http://www.geneontology.org/gene-associations/readme/mgi.README', + IDENTIFIER_2 => 'MGI_ID', + IDENTIFIER_3 => 'Gene Symbol', + IDENTIFIER_11 => 'Gene_Symbol (old)', + IDENTIFIER_2_EXAMPLE => 'MGI:1889008', + IDENTIFIER_3_EXAMPLE => 'Atp2c1', + IDENTIFIER_11_EXAMPLE => '1700121J11Rik', + SAMPLE_GENE_LIST => '/sampleGeneFiles/mgi.txt', + ANNOTATION_FILE => 'gene_association.mgi', + MENU_LABEL => 'MGI (Mouse - M. musculus)', + SLIM_MENU_LABEL => 'MGI (Generic GO slim)' + }, + pseudocap => { + GENE_URL => 'http://www.pseudomonas.com/AnnotationByPAU.asp?PA=', + COMMON_NAME => 'Pseudomonas', + ORGANISM => 'Pseudomonas aeruginosa PAO1', + ORGANISM_DB_NAME => 'PseudoCAP', + ORGANISM_DB_URL => 'http://v2.pseudomonas.com/', + DEFAULT_GENE_URL => 'http://www.pseudomonas.com/AnnotationByPAU.asp?PA=PA4765', + README_URL => '', + IDENTIFIER_2 => 'PA#', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Alt. Gene Name (opt.)', + IDENTIFIER_2_EXAMPLE => 'PA4765', + IDENTIFIER_3_EXAMPLE => 'omlA', + IDENTIFIER_11_EXAMPLE => 'oprX', + SAMPLE_GENE_LIST => '/sampleGeneFiles/pseudocap.txt', + ANNOTATION_FILE => 'gene_association.pseudocap', + MENU_LABEL => 'PseudoCAP (P. aeruginosa)', + SLIM_MENU_LABEL => 'PseudoCAP (Generic GO slim)' + }, + rgd => { + GENE_URL => 'http://rgd.mcw.edu/tools/genes/genes_view.cgi?id=', + COMMON_NAME => 'Rat', + ORGANISM => 'Rattus norvegicus', + ORGANISM_DB_NAME => 'RGD', + ORGANISM_DB_URL => 'http://rgd.mcw.edu/', + DEFAULT_GENE_URL => 'http://rgd.mcw.edu/tools/genes/genes_view.cgi?id'.'=RGD:3696', + README_URL => 'http://www.geneontology.org/gene-associations/readme/rgd.README', + IDENTIFIER_2 => 'RGD_ID (or Ensembl Id, or UniProt accession)', + IDENTIFIER_3 => 'Gene Symbol (or UniProt Entry Name)', + IDENTIFIER_11 => 'if GOA-provided, an International Protein Index identifier', + IDENTIFIER_2_EXAMPLE => 'RGD:3696', + IDENTIFIER_3_EXAMPLE => 'Slc1a1', + IDENTIFIER_11_EXAMPLE => 'EAAC1', + SAMPLE_GENE_LIST => '/sampleGeneFiles/rgd.txt', + ANNOTATION_FILE => 'gene_association.rgd', + MENU_LABEL => 'RGD (Rat - R. novegicus)', + SLIM_MENU_LABEL => 'RGD (Generic GO slim)' + }, +# rgd_synonyms => { +# # augmented rgd +# GENE_URL => 'http://rgd.mcw.edu/tools/genes/genes_view.cgi?id=', +# COMMON_NAME => 'Rat', +# ORGANISM => 'Rattus norvegicus', +# ORGANISM_DB_NAME => 'RGD + XREFS', +# ORGANISM_DB_URL => 'http://rgd.mcw.edu/', +# DEFAULT_GENE_URL => 'http://rgd.mcw.edu/tools/genes/genes_view.cgi?id'.'=RGD:3696', +# README_URL => 'http://www.geneontology.org/gene-associations/readme/rgd.README', +# IDENTIFIER_2 => 'RGD_ID (or Ensembl Id, or UniProt accession)', +# IDENTIFIER_3 => 'Gene Symbol (or UniProt Entry Name)', +# IDENTIFIER_11 => 'International Protein Index identifier, additional synonyms', +# IDENTIFIER_2_EXAMPLE => 'RGD:3696', +# IDENTIFIER_3_EXAMPLE => 'Slc1a1', +# IDENTIFIER_11_EXAMPLE => 'EAAC1', +# SAMPLE_GENE_LIST => '/sampleGeneFiles/rgd_synonyms.txt', +# ANNOTATION_FILE => 'gene_association.rgd_synonyms', +# MENU_LABEL => 'RGD with synonyms (Rat - R. novegicus)', +# SLIM_MENU_LABEL => 'RGD with synonyms (Generic GO slim)' +# }, + sgd => { + GENE_URL => 'https://www.yeastgenome.org/locus/', + COMMON_NAME => 'Yeast', + ORGANISM => 'Saccharomyces cerevisiae', + ORGANISM_DB_NAME => 'SGD', + ORGANISM_DB_URL => 'https://www.yeastgenome.org/', + DEFAULT_GENE_URL => 'https://www.yeastgenome.org/locus/YPL250C', + README_URL => 'http://www.geneontology.org/gene-associations/readme/sgd.README', + IDENTIFIER_2 => 'SGD_ID', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Systematic ORF Name', + IDENTIFIER_2_EXAMPLE => 'S0006171', + IDENTIFIER_3_EXAMPLE => 'ICY2', + IDENTIFIER_11_EXAMPLE => 'YPL250C', + SAMPLE_GENE_LIST => '/sampleGeneFiles/sgd.txt', + ANNOTATION_FILE => 'gene_association.sgd', + MENU_LABEL => 'SGD (Yeast - S. cerevisiae)', + SLIM_MENU_LABEL => 'SGD (Yeast - S. cerevisiae GO slim)' + }, + sgd_generic => { + ANNOTATION_FILE => 'gene_association.sgd_generic', + SLIM_MENU_LABEL => 'SGD (Generic GO slim)' + }, + tair => { +# GENE_URL => 'http://www.arabidopsis.org/servlets/TairObject?type=gene&name=', + GENE_URL => 'http://www.arabidopsis.org/servlets/Search?type=general&search_action=detail&method=1&show_obsolete=F&sub_type=gene&SEARCH_EXACT=4&SEARCH_CONTAINS=1&name=', + COMMON_NAME => 'Common wallcress', + ORGANISM => 'Arabidopsis thaliana', + ORGANISM_DB_NAME => 'TAIR', + ORGANISM_DB_URL => 'http://www.arabidopsis.org/', +# DEFAULT_GENE_URL => 'http://www.arabidopsis.org/servlets/TairObject?type'.'=gene&name'.'=AT3G52270.1', + DEFAULT_GENE_URL => 'http://www.arabidopsis.org/servlets/Search?type=general&search_action=detail&method=1&show_obsolete=F&name=AT1G57980&sub_type=gene&SEARCH_EXACT=4&SEARCH_CONTAINS=1', + README_URL => 'http://www.geneontology.org/gene-associations/readme/tair.README', + IDENTIFIER_2 => 'TAIR Accession', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Gene Alias', + IDENTIFIER_2_EXAMPLE => 'gene:2100553', + IDENTIFIER_3_EXAMPLE => 'AT3G52270.1', + IDENTIFIER_11_EXAMPLE => 'T25B15.40', + SAMPLE_GENE_LIST => '/sampleGeneFiles/tair.txt', + ANNOTATION_FILE => 'gene_association.tair', + MENU_LABEL => 'TAIR (Common wallcress - A. thaliana)', + SLIM_MENU_LABEL => 'TAIR (Generic GO slim)' + }, + jcvi_Banthracis => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => '', + ORGANISM => 'Bacillus anthracis', + ORGANISM_DB_NAME => 'JCVI anthracis', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=gba', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus='.'BA0001', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'BA0001', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'dnaA', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_anthracis.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Banthracis', + MENU_LABEL => 'JCVI_Banthracis (B. anthracis)', + SLIM_MENU_LABEL => 'JCVI_Banthracis (Generic GO slim)' + }, + jcvi_Cburnetii => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => '', + ORGANISM => 'Coxiella burnetii', + ORGANISM_DB_NAME => 'JCVI coxiella', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=gcb', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus='.'CBU0001', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'CBU0001', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'dnaA', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_coxiella.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Cburnetii', + MENU_LABEL => 'JCVI_Cburnetii (C. burnetii)', + SLIM_MENU_LABEL => 'JCVI_Cburnetii (Generic GO slim)' + }, + jcvi_Cjejuni => { + ORGANISM => 'Campylobacter jejuni', + COMMON_NAME => '', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + ANNOTATION_FILE => 'gene_association.jcvi_Cjejuni', + MENU_LABEL => 'JCVI_Cjejuni (C. jejuni)', + SLIM_MENU_LABEL => 'JCVI_Cjejuni (Generic GO slim)' + }, + jcvi_Dethenogenes => { + ORGANISM => 'Dehalococcoides ethenogenes', + COMMON_NAME => '', + ANNOTATION_FILE => 'gene_association.jcvi_Dethenogenes', + MENU_LABEL => 'JCVI_Dethenogenes (D. ethenogenes)', + SLIM_MENU_LABEL => 'JCVI_Dethenogenes (Generic GO slim)' + }, + jcvi_Gsulfurreducens => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => 'Geobacter', + ORGANISM => 'Geobacter sulfurreducens PCA', + ORGANISM_DB_NAME => 'JCVI Gsulfurreducens', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=ggs', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus'.'=GSU0001', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'GSU_0001', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'dnaN', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_gsulfurreducens.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Gsulfurreducens', + MENU_LABEL => 'JCVI_Gsulfurreducens (G. sulfurreducens PCA)', + SLIM_MENU_LABEL => 'JCVI_Gsulfurreducens (Generic GO slim)' + }, + jcvi_Lmonocytogenes => { +# GENE_URL => undef, + ORGANISM => 'Listeria monocytogenes', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + ANNOTATION_FILE => 'gene_association.jcvi_Lmonocytogenes', + MENU_LABEL => 'JCVI_Lmonocytogenes (L. monocytogenes)', + SLIM_MENU_LABEL => 'JCVI_Lmonocytogenes (Generic GO slim)' + }, + jcvi_Mcapsulatus => { + ORGANISM => 'Methylococcus capsulatus', + COMMON_NAME => '', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + ANNOTATION_FILE => 'gene_association.jcvi_Mcapsulatus', + MENU_LABEL => 'JCVI_Mcapsulatus (M. capsulatus)', + SLIM_MENU_LABEL => 'JCVI_Mcapsulatus (Generic GO slim)' + }, + jcvi_Psyringae => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => '', + ORGANISM => 'Pseudomonas syringae', + ORGANISM_DB_NAME => 'JCVI Psyringae', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=gps', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus'.'=PSPTO5598', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'PSPTO5598', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'atpC', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_psyringae.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Psyringae', + MENU_LABEL => 'JCVI_Psyringae (P. syringae)', + SLIM_MENU_LABEL => 'JCVI_Psyringae (Generic GO slim)' + }, + jcvi_Soneidensis => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => '', + ORGANISM => 'Shewanella oneidensis', + ORGANISM_DB_NAME => 'JCVI shewanella', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=gsp', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus'.'=SO0001', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'SO0001', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'mioC', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_shewannella.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Soneidensis', + MENU_LABEL => 'JCVI_Soneidensis (S. oneidensis)', + SLIM_MENU_LABEL => 'JCVI_Soneidensis (Generic GO slim)' + }, + jcvi_Spomeroyi => { + ORGANISM => 'Silicibacter pomeroyi', + COMMON_NAME => '', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + ANNOTATION_FILE => 'gene_association.jcvi_Spomeroyi', + MENU_LABEL => 'JCVI_Spomeroyi (S. pomeroyi)', + SLIM_MENU_LABEL => 'JCVI_Spomeroyi (Generic GO slim)' + }, +# jcvi_Tbrucei_chr2 => { +## GENE_URL => 'http://www.jcvi.org/tigr-scripts/euk_manatee/shared/ORF_infopage.cgi?db=tba1&;orf=', +# COMMON_NAME => '', +# ORGANISM => 'Trypanosoma brucei', +# ORGANISM_DB_NAME => 'JCVI Tbrucei chr2', +## ORGANISM_DB_URL => 'http://www.jcvi.org/tdb/e2k1/tba1/', +## DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/euk_manatee/shared/ORF_infopage.cgi?db'.'=tba1&;orf'.'=Tb927.2.5850', +# README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', +# IDENTIFIER_2 => 'JCVI Locus Name', +# IDENTIFIER_3 => 'JCVI Locus Name', +# IDENTIFIER_11 => '', +# IDENTIFIER_2_EXAMPLE => 'Tb927.2.5850', +# IDENTIFIER_3_EXAMPLE => '1F7.295', +# IDENTIFIER_11_EXAMPLE => '', +# SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_tbrucei_chr2.txt', +# ANNOTATION_FILE => 'gene_association.jcvi_Tbrucei_chr2', +# MENU_LABEL => 'JCVI_Tbrucei_chr2 (T. brucei)', +# SLIM_MENU_LABEL => 'JCVI_Tbrucei_chr2 (Generic GO slim)' +# }, + jcvi_Vcholerae => { +# GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus=', + COMMON_NAME => 'Cholera spirillum', + ORGANISM => 'Vibrio cholerae', + ORGANISM_DB_NAME => 'JCVI vibrio', +# ORGANISM_DB_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenomePage3.spl?database'.'=gvc', +# DEFAULT_GENE_URL => 'http://www.jcvi.org/tigr-scripts/CMR2/GenePage.spl?locus'.'=VC0112', + README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', + IDENTIFIER_2 => 'JCVI Locus Name', + IDENTIFIER_3 => '', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'VC0112', + IDENTIFIER_3_EXAMPLE => '', + IDENTIFIER_11_EXAMPLE => 'cycA', + SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_vibrio.txt', + ANNOTATION_FILE => 'gene_association.jcvi_Vcholerae', + MENU_LABEL => 'JCVI_Vcholerae (V. cholerae)', + SLIM_MENU_LABEL => 'JCVI_Vcholerae (Generic GO slim)' + }, +# jcvi_gene_index => { +## GENE_URL => undef, +# COMMON_NAME => 'Gene Index', +# ORGANISM => '', +# ORGANISM_DB_NAME => 'JCVI gene index', +## ORGANISM_DB_URL => 'http://www.jcvi.org/tdb/tgi/index.shtml', +## DEFAULT_GENE_URL => '', +# README_URL => 'http://www.geneontology.org/gene-associations/readme/jcvi_prokaryotic.README', +# IDENTIFIER_2 => 'JCVI_TGI_ID', +# IDENTIFIER_3 => '', +# IDENTIFIER_11 => '', +# IDENTIFIER_2_EXAMPLE => 'Arabidopsis_TC171812', +# IDENTIFIER_3_EXAMPLE => '', +# IDENTIFIER_11_EXAMPLE => '', +# SAMPLE_GENE_LIST => '/sampleGeneFiles/jcvi_gene_index.txt', +# ANNOTATION_FILE => 'gene_association.jcvi_gene_index', +# MENU_LABEL => 'JCVI_gene_index', +# SLIM_MENU_LABEL => 'JCVI_gene_index (Generic GO slim)' +# }, + wb => { + GENE_URL => 'http://www.wormbase.org/db/gene/gene?name=', + COMMON_NAME => 'Worm', + ORGANISM => 'Caenorhabditis elegans', + ORGANISM_DB_NAME => 'WormBase', + ORGANISM_DB_URL => 'http://www.wormbase.org/', + DEFAULT_GENE_URL => 'http://www.wormbase.org/db/gene/gene?name'.'=nhr-10', + README_URL => 'http://www.geneontology.org/gene-associations/readme/WormBase.README', + IDENTIFIER_2 => 'Protein Name', + IDENTIFIER_3 => 'Gene Name', + IDENTIFIER_11 => 'Gene Symbol', + IDENTIFIER_2_EXAMPLE => 'CE00815', + IDENTIFIER_3_EXAMPLE => 'nhr-10', + IDENTIFIER_11_EXAMPLE => 'B0280.8', + SAMPLE_GENE_LIST => '/sampleGeneFiles/wb.txt', + ANNOTATION_FILE => 'gene_association.wb', + MENU_LABEL => 'WormBase (Worm - C. elegans)', + SLIM_MENU_LABEL => 'WormBase (Generic GO slim)' + }, + zfin => { + GENE_URL => 'http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=', + COMMON_NAME => 'Zebrafish', + ORGANISM => 'Danio rerio', + ORGANISM_DB_NAME => 'ZFIN', + ORGANISM_DB_URL => 'http://zfin.org/cgi-bin/webdriver?MIval'.'=aa-ZDB_home.apg', + DEFAULT_GENE_URL => 'http://zfin.org/cgi-bin/webdriver?MIval'.'=aa-markerview.apg&OID'.'=ZDB-GENE-000125-12', + README_URL => 'http://www.geneontology.org/gene-associations/readme/zfin.README', + IDENTIFIER_2 => 'ZFIN_ID', + IDENTIFIER_3 => 'Gene Symbol', + IDENTIFIER_11 => '', + IDENTIFIER_2_EXAMPLE => 'ZDB-GENE-000125-12', + IDENTIFIER_3_EXAMPLE => 'igfbp2', + IDENTIFIER_11_EXAMPLE => '', + SAMPLE_GENE_LIST => '/sampleGeneFiles/zfin.txt', + ANNOTATION_FILE => 'gene_association.zfin', + MENU_LABEL => 'ZFIN (Zebrafish - D. rerio)', + SLIM_MENU_LABEL => 'ZFIN (Generic GO slim)' + } + ); + + +############################################################################## +# CLASS PRIVATE METHODS +# +############################################################################## +sub _annotationFileSuffix { +############################################################################## +# This class private method return gene annotation file suffix for the supplied +# gene annotation file. + + my ($self, $annotationFile) = @_; + + if ($annotationFile) { + my @annotationFileSuffix = split (/\./, $annotationFile); + my $annotationFileSuffix = $annotationFileSuffix[$#annotationFileSuffix]; + + return ($annotationFileSuffix); + } +} + + +############################################################################## +# CLASS PUBLIC METHODS +# +############################################################################## +sub new { +############################################################################## +=pod + +=head1 Constructor + +=head2 new + +This is the constructor for an MetaData object. + +Usage: + + my $metaData = GO::AnnotationProvider::MetaData->new(); + +=cut + + my ($class) = @_; + #---Create a new object + my $self = bless { }, $class; + + return $self; +} + + +=pod + +=head1 Public Instance Methods + +=cut + +############################################################################## +sub AUTOLOAD { +############################################################################## +=pod + +=head2 AUTOLOAD + +This public AUTOLOAD method is used to get attributes and set attributes for an +organism's gene annotation file for the supplied gene annotation file. + +Usage: + + my $annotationFile = 'gene_association.sgd'; + + #---Get this organism's annotation file extension (i.e. suffix) + my @annotationFileSuffix = split (/\./, $annotationFile); + my $annotationFileSuffix = $annotationFileSuffix[$#annotationFileSuffix]; + + #MUTATOR METHODS + #---Call this MetaData.pm's set method to set the value for ANNOTATION_FILE for this organism + $metaData->setANNOTATION_FILE($file, $annotationFile); + #---Call this MetaData.pm's set method to set the value for ANNOTATE_GENE_NUM for this organism + $metaData->setANNOTATE_GENE_NUM($annotateGeneNum, $annotationFile); + #---Call this MetaData.pm's set method to set the value for ESTIMATE_GENE_NUM for this organism + $metaData->setESTIMATE_GENE_NUM($annotationFile); + + #ACCESSOR METHODS + #---Call this MetaData.pm's get method to get organism gene url + my ($organismGeneUrl) = $metaData->getGENE_URL($annotationFile); + #---Call this MetaData.pm's get method to get organism common name + my ($organismCommonName) = $metaData->getCOMMON_NAME($annotationFile); + #---Call this MetaData.pm's get method to get organism scientific name + my ($ogranismScientificName) = $metaData->getORGANISM($annotationFile); + #---Call this MetaData.pm's get method to get organism database name + my ($organismDatabaseName) = $metaData->getORGANISM_DB_NAME($annotationFile); + #---Call this MetaData.pm's get method to get organism database url + my ($organismDatabaseUrl) = $metaData->getORGANISM_DB_URL($annotationFile); + #---Call this MetaData.pm's get method to get all the attributes about this organism + my ($organismGeneAsscFileRef) = $metaData->getOrganismInfo($annotationFile); + +=cut + + my ($self, $annotationFile) = @_; + + #---Get the annotation file's file extension (i.e. suffix) + my $annotationFileSuffix = $self->_annotationFileSuffix($annotationFile); + #$DEBUG && print "annotationFileSuffix: $annotationFileSuffix\n"; + + #$DEBUG && print "AUTOLOAD is set to ", $AUTOLOAD, "\n"; + my ($getORsetMethod, $getORsetAttribute) = ( $AUTOLOAD =~ /(get|set)(\w+)/); + #$DEBUG && print "\$getORsetMethod: ", $getORsetMethod, " \$getORsetAttribute: ", $getORsetAttribute, "\n"; + + if ($getORsetMethod) { + #---AUTOLOAD accessor methods + if ($getORsetMethod eq 'get') { + + #AUTOLOAD accessor method to get all the class data attributes for this organism (e.g. getOrganismInfo) + if ($getORsetAttribute eq 'OrganismInfo') { + return ($_annotationFiles{ $annotationFileSuffix }); #This returns reference + }#if ($getORsetAttribute eq 'OrganismInfo') + + #AUTOLOAD accessor methods to get a specific class data attribute for this organism (e.g. getGENE_URL, getCOMMON_NAME, getORGANISM, getORGANISM_DB_NAME, getORGANISM_DB_URL) + else { + return ( $_annotationFiles{ $annotationFileSuffix }->{ $getORsetAttribute } ); #This returns value + } + }#end if ($getORsetMethod eq 'get') + + + #---AUTOLOAD mutator methods + elsif ($getORsetMethod eq 'set') { + + #AUTOLOAD mutator method to set value for ESTIMATE_GENE_NUM attribute (e.g. setESTIMATE_GENE_NUM) + if ($getORsetAttribute eq 'ESTIMATE_GENE_NUM') { + if ($_annotationFiles{ $annotationFileSuffix }->{ORGANISM} ) { + $_annotationFiles{ $annotationFileSuffix }->{$getORsetAttribute} = $organismEstimateTotalGeneNum { $_annotationFiles{ $annotationFileSuffix }->{ORGANISM} }; + } + else { + $_annotationFiles{ $annotationFileSuffix }->{$getORsetAttribute} = ''; + } + + $DEBUG && print "$getORsetAttribute: ", $_annotationFiles{ $annotationFileSuffix }->{$getORsetAttribute}, "\n"; + }#end if ($getORsetAttribute eq 'ESTIMATE_GENE_NUM') + + #AUTOLOAD mutator methods to set values for some of this class's data attributes to newValue (e.g. setANNOTATION_FILE, setANNOTATE_GENE_NUM) + else { + my ($self, $newValue, $annotationFile) = @_; + + #---Get the annotation file's file extension (i.e. suffix) + my $annotationFileSuffix = $self->_annotationFileSuffix($annotationFile); + $DEBUG && print "annotationFileSuffix: $annotationFileSuffix\n"; + + $_annotationFiles{ $annotationFileSuffix }->{$getORsetAttribute} = $newValue; + $DEBUG && print "$getORsetAttribute: ", $_annotationFiles{ $annotationFileSuffix }->{$getORsetAttribute}, "\n"; + + }#end else + + }#end elsif ($getORsetMethod eq 'set') + + else { croak "No such method: ", $AUTOLOAD, "\n"; } + } +} + +############################################################################## +sub createGeneAsscTable { +############################################################################## +=pod + +=head2 createGeneAsscTable + +This public method creates a html table of the gene annotation file(s) available at GO ftp site +for the supplied reference to an array of hash reference(s), whose key is the gene annotation +file suffix and whose value is the reference to all the attributes of the organims. The html +table is written to the specified filepath (2nd argument). + +An example of a html table of the gene annotation files created by this public method can +be viewed at http://go.princeton.edu/GOTermFinder/GOTermFinder_help.shtml#asscFile + +Usage: + + $metaData->createGeneAsscTable(\@organismsInfoArray, $filepath, $relativeUrl); + + +Usage Example: + + use GO::AnnotationProvider::AnnotationParser; + use go::AnnotationProvider::MetaData; + + + my $metaData = GO::AnnotationProvider::MetaData->new(); + my $Dir = '/usr/local/go/lib/GO/'; #Directory where the GO and gene annotation files downloaded from GO ftp site locate + my $relativeUrl = "/goFiles/"; # a URL relative to the DOCUMENT_ROOT, where can the associations file be found (or linked to). + + opendir (DIR, $Dir) or die("Error opening: $!"); + my @organismsInfoArray; + foreach my $file (readdir(DIR)) { + if ($file !~ m/ontology/) { + my $annotationFilePath = $Dir.$file; + + # Get this organism's total annotated gene number + my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile=>$annotationFilePath); + my @databaseIds = $annotationParser->allDatabaseIds(); + my $annotateGeneNum = scalar(@databaseIds); + + #---Get this organism's annotation file extension (i.e. suffix) + my @annotationFileSuffix = split (/\./, $file); + my $annotationFileSuffix = $annotationFileSuffix[$#annotationFileSuffix]; + + # Call these mutator methods to set values of these attributes for this organism + $metaData->setANNOTATION_FILE($file, $annotationFileSuffix); + $metaData->setANNOTATE_GENE_NUM($annotateGeneNum, $annotationFileSuffix); + $metaData->setESTIMATE_GENE_NUM($annotationFileSuffix); + + # Call this accessor method to return a hash reference of all attributes for this organism + my ($organismInfo) = $metaData->getOrganismInfo($annotationFileSuffix); + # @organismsInfoArray contains hash references of attributes for all the organisms + push(@organismsInfoArray, $organismInfo); + } + } + + # Call this method to create an html table of all the gene association files + my $filepath = "/Genomics/curtal/www/go/html/GOTermMapper/MetaData.html"; + $metaData->createGeneAsscTable(\@organismsInfoArray, $filepath, $relativeUrl); + + closedir(DIR); + +=cut + + my ($self, $organismGeneAsscFilesArrayRef, $filepath, $relativeUrl) = @_; + + #dereference referenced variables + my @organismGeneAsscFilesArray = @$organismGeneAsscFilesArrayRef; + + my @rows; + + foreach my $geneAsscFile (@organismGeneAsscFilesArray) { + + my $commonName = $geneAsscFile->{COMMON_NAME}; + my $organism = $geneAsscFile->{ORGANISM}; + my $orgText = $organism; + if($commonName){$orgText = "$commonName - $orgText"; } + + my $organismDBName = $geneAsscFile->{ORGANISM_DB_NAME}; + + my $organismDBUrl = $geneAsscFile->{ORGANISM_DB_URL}; + my $organismDBLink = ""; + if ($organismDBUrl) { + $organismDBLink = a({-href=>$organismDBUrl}, b($organismDBName)); + } + + my $defaultGeneUrl = $geneAsscFile->{DEFAULT_GENE_URL}; + my $defaultGeneUrlLink = ""; + if ($defaultGeneUrl) { + $defaultGeneUrlLink = a({-href=>$defaultGeneUrl}, "Default gene URL"); + } + + my $annotationFile = $geneAsscFile->{ANNOTATION_FILE}; + my $annotationFileLink = ""; + if ($annotationFile) { + $annotationFileLink = a({-href=>$relativeUrl.$annotationFile}, $annotationFile); + } + + my $readMeUrl = $geneAsscFile->{README_URL}; + my $readMeUrlLink = ""; + if ($readMeUrl){ + $readMeUrlLink = a({-href=>$readMeUrl}, "README"); + } + + my $annotateGeneNum = $geneAsscFile->{ANNOTATE_GENE_NUM}; + my $estimateGeneNum = $geneAsscFile->{ESTIMATE_GENE_NUM} || ''; + my $identifier_2 = $geneAsscFile->{IDENTIFIER_2} || ''; + my $identifier_3 = $geneAsscFile->{IDENTIFIER_3} || ''; + my $identifier_11 = $geneAsscFile->{IDENTIFIER_11} || ''; + my $identifier_2_example = $geneAsscFile->{IDENTIFIER_2_EXAMPLE} || ''; + my $identifier_3_example = $geneAsscFile->{IDENTIFIER_3_EXAMPLE} || ''; + my $identifier_11_example = $geneAsscFile->{IDENTIFIER_11_EXAMPLE} || ''; + + my $sample_list = $geneAsscFile->{SAMPLE_GENE_LIST}; + my $sample_list_link = ""; + if ($sample_list) { + $sample_list_link = a({-href=>$sample_list}, "sample list"); + } + + my $conversionTool = $geneAsscFile->{CONVERSION_TOOL}; + my $conversionToolLink = ""; + if ($conversionTool ) { + # redefine if any were specified + foreach my $label (keys(%{$conversionTool})){ + $conversionToolLink .= li(a({-href=>$$conversionTool{$label}}, $label)); + } + $conversionToolLink = ul($conversionToolLink); + } + + if($annotateGeneNum > 0){ + my $row = + Tr({-bgcolor=>"#DEB887"}, + td({-align=>"left", -valign=>"top", -nowrap=>''}, '', join(br(), $orgText, $organismDBLink, $defaultGeneUrlLink, $annotationFileLink, $readMeUrlLink), ''), + td({-align=>"left", -valign=>"top", -nowrap=>''}, '', $annotateGeneNum , ''), + td({-align=>"left", -valign=>"top", -nowrap=>''}, '',$estimateGeneNum, ''), + td({-align=>"left", -valign=>"top", -nowrap=>''}, '', join(br(), $identifier_2, $identifier_3, $identifier_11), ''), + td({-align=>"left", -valign=>"top", -nowrap=>''}, '', join(br(), $identifier_2_example, $identifier_3_example, $identifier_11_example, $sample_list_link), ''), + td({-align=>"left", -valign=>"top"}, '', $conversionToolLink, '') + ); + +# print $row."\n"; + push(@rows,$row); + + }else{ + warn "$annotationFile has zero annotations (probably filtered or eliminated by GO consortium conventions)\n"; + } + + }#end foreach my $geneAsscFile (@organismGeneAsscFilesArray) + + #---Generate the requested gene association table file + open (HTML, ">$filepath") || die "count not open $filepath : $!"; + + print HTML "
\n", + "", + "\n", + "\n", + "\n", + "\n", + + "\n", + "\n", + "\n", + "\n", + "\n", + "Identifiers\n", + "\n", + "\n", + "\n"; + + print HTML "@rows"; + + print HTML "\n
Gene Association File Table
\n", + "\n", + "Organism, Gene Associations, and Authority<\a>\n", + "\n", + "Total Annotated Gene ProductsTotal Estimated Gene Products\n", +# "\n", +# "IdentifiersExample IDsIdentifier Conversion Tool(s)
\n", + "
\n"; + + close(HTML); + +} + +############################################################################## +sub geneAnnotationFilesMenu { +############################################################################## +=pod + +=head2 geneAnnotationFilesMenu + +This public method returns a reference to an array of gene annotation +files and a reference to a hash, whose keys are the gene annotation +files and whose value are gene annotation file labels. These returned +array and hash references can then be used to create GOTermFinder's +pull-down menu of gene annotation files. + +Usage: + + my ($geneAssValuesArrayRef, $geneAssLabelsHashRef) = $metaData->geneAnnotationFilesMenu; + +=cut + + my ($self) = @_; + + return( $self->_menuValuesAndLables( labelKey => 'MENU_LABEL' )); +} + + +############################################################################## +sub geneAnnotationSlimFilesMenu { +############################################################################## +=pod + +=head2 geneAnnotationSlimFilesMenu + +This public method returns a reference to an array of gene annotation +files and a reference to a hash, whose keys are the gene annotation +files and whose value are gene annotation file labels. These returned +array and hash references can then be used to create a GOTermMapper's +pull-down menu of gene annotation files. + +Usage: + + my ($geneAssValuesArrayRef, $geneAssLabelsHashRef) = $metaData->geneAnnotationSlimFilesMenu; + +=cut + + my ($self) = @_; + + return( $self->_menuValuesAndLables(labelKey => 'SLIM_MENU_LABEL') ); +} + + +############################################################################## +sub _menuValuesAndLables { +############################################################################## +# given a key (MENU_LABEL or SLIM_MENU_LABEL), this private method +# returns a 2 references suitable to construct a CGI popup menu. +# Returned, in order : +# 1) reference to a array of menu values +# 2) reference to a hash of value to label +# + + my $self = shift; + my %args = @_; + + my $labelKey = $args{'labelKey'}; + my $fileKey = 'ANNOTATION_FILE'; + + my $check_dir; + if ($args{'checkUsableDir'}) { + $check_dir = $args{'checkUsableDir'}; + } + + my (@values, %labels); + + foreach my $annotationFileSuffix ( keys(%_annotationFiles) ) { + next if( $_annotationFiles{ $annotationFileSuffix }->{UNUSABLE} ); + # if both a value and a label are defined, use them + if( $_annotationFiles{ $annotationFileSuffix }->{$fileKey} && + $_annotationFiles{ $annotationFileSuffix }->{$labelKey} ) { + + push(@values, $_annotationFiles{ $annotationFileSuffix }->{$fileKey}); + $labels{ $_annotationFiles{ $annotationFileSuffix }->{$fileKey} } = $_annotationFiles{ $annotationFileSuffix }->{$labelKey}; + + } + + } + + @values = sort(@values); + + return(\@values, \%labels); +} + + +############################################################################## +sub checkUnusable { +############################################################################## + my $self = shift; + my %args = @_; + + my $check_dir = $args{dir}; + return undef unless ($check_dir); + + my @unusables; + + my $n = 0; + foreach my $annotationFileSuffix ( keys(%_annotationFiles) ) { + # check if the file is usable, as indicated by readability for now + if( $_annotationFiles{ $annotationFileSuffix }->{ANNOTATION_FILE} ){ + if (! -r $check_dir."/".$_annotationFiles{ $annotationFileSuffix }->{ANNOTATION_FILE}) { + my %info; + $info{ANNOTATION_FILE_SUFFIX} = $annotationFileSuffix; + foreach my $key ("ORGANISM", "ORGANISM_DB_NAME", "MENU_LABEL") { + $info{$key} = $_annotationFiles{ $annotationFileSuffix }->{$key}; + } + $_annotationFiles{ $annotationFileSuffix }->{UNUSABLE} = 1; + $unusables[$n++] = \%info; + } else { + $_annotationFiles{ $annotationFileSuffix }->{UNUSABLE} = 0; + } + } + } + + return \@unusables; +} + + +############################################################################## +sub organismEstimateTotalGeneNum { +############################################################################## +=pod + +=head2 organismEstimateTotalGeneNum + +This public method returns an organism's estimate total gene number for the +supplied gene annotation file. + +Usage: + + my ($estimateGeneNum) = $metaData->organismEstimateTotalGeneNum($annotionFile); + +=cut + + my ($self, $annotationFile) = @_; + + #---Get the annotation file's file extension (i.e. suffix) + my $annotationFileSuffix = $self->_annotationFileSuffix($annotationFile); + my $organism = $_annotationFiles{ $annotationFileSuffix }->{ORGANISM}; + $DEBUG && print "annotationFileSuffix: $annotationFileSuffix, organism $organism\n"; + + #---Get TOTAL_NUM_GENES for the annotation file, if TOTAL_NUM_GENES is defined + my $totalNumGenes; + if (($organism) && (defined($organismEstimateTotalGeneNum { $organism }))) { + $totalNumGenes = $organismEstimateTotalGeneNum { $organism }; +# } +# else { +# $totalNumGenes = ''; + } + + return $totalNumGenes; +} + + +############################################################################## +sub organismGeneUrl { +############################################################################## +=pod + +=head2 organismGeneUrl + +This public method returns an organism's gene url for the supplied gene url specified in the +$opt_u variable. If no gene url specified in the $opt_u variable, the method returns the +organism's default gene url for the supplied gene annotation file. + +Usage: + + my $opt_u = 'http://genome-www4.stanford.edu/cgi-bin/SGD/locus.pl?locus='; + my ($geneUrl) = $metaData->organismGeneUrl($annotationFile, $opt_u); + +=cut + + my ($self, $annotationFile, $opt_u) = @_; + + #---Get the annotation file's file extension (i.e. suffix) + my $annotationFileSuffix = $self->_annotationFileSuffix($annotationFile); + $DEBUG && print "annotationFileSuffix: $annotationFileSuffix\n"; + + #---Get GENE_URL for the annotation file. + # If user specifies the geneUrl, then use the user specified geneUrl. Otherwise use the + # geneUrl defined in the %annotationFiles hash + my $geneUrl; + if ($opt_u) { + $geneUrl = $opt_u; + } else { + if (defined($_annotationFiles{ $annotationFileSuffix }->{GENE_URL}) ) { + $geneUrl = $_annotationFiles{ $annotationFileSuffix }->{GENE_URL}; +# $geneUrl .= "" if ($geneUrl !~ /\organismFileExist($annotationFile); + +=cut + + my ($self, $annotationFile) = @_; + + #---Get the annotation file's file extension (i.e. suffix) + my $annotationFileSuffix = $self->_annotationFileSuffix($annotationFile); + $DEBUG && print "annotationFileSuffix: $annotationFileSuffix\n"; + + #---Check if the annotation file exists + if(exists($_annotationFiles{ $annotationFileSuffix }) ) { + return(1); + } + else { + return(0); + } +} + +1; #make sure the file returns true or require will fail + + + +############################################################################## +# Additional POD (Plain Old Documentation) from here on down +############################################################################## + +__END__ + +=pod + +=head1 AUTHOR + +Linda McMahan, lmcmahan@genomics.princeton.edu + +=cut diff --git a/www/lib/GO/OntologyProvider/OboParser.pm b/www/lib/GO/OntologyProvider/OboParser.pm new file mode 100644 index 0000000..8be69e0 --- /dev/null +++ b/www/lib/GO/OntologyProvider/OboParser.pm @@ -0,0 +1,1042 @@ +package GO::OntologyProvider::OboParser; + +# File : OboParser.pm +# Authors : Elizabeth Boyle; Gavin Sherlock +# Date Begun : Summer 2001 +# Rewritten : September 29th 2002 +# +# Updated to parse the gene ontology info from the obo file. +# August 2006, Shuai Weng +# +# $Id: OboParser.pm,v 1.6 2011/06/03 18:01:40 mark Exp $ + +# License information (the MIT license) + +# Copyright (c) 2003 Gavin Sherlock; Stanford University + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +=pod + +=head1 NAME + +GO::OntologyProvider::OboParser - Provides API for retrieving data from Gene Ontology obo file. + +=head1 SYNOPSIS + + use GO::OntologyProvider::OboParser; + + my $ontology = GO::OntologyProvider::OboParser->new(ontologyFile => "gene_ontology.obo", + aspect => [P|F|C]); + + print "The ancestors of GO:0006177 are:\n"; + + my $node = $ontology->nodeFromId("GO:0006177"); + + foreach my $ancestor ($node->ancestors){ + + print $ancestor->goid, " ", $ancestor->term, "\n"; + + } + + $ontology->printOntology(); + + +=head1 DESCRIPTION + +GO::OntologyProvider::OboParser implements the interface defined by +GO::OntologyProvider, and parses the gene ontology obo file (GO) in +plain text (not XML) format. These files can be obtained from the +Gene Ontology Consortium web site, http://www.geneontology.org/. From +the information in the file, it creates a directed acyclic graph (DAG) +structure in memory. This means that GO terms are arranged into +tree-like structures where each GO node can have multiple parent nodes +and multiple child nodes. The file MUST be named with a .obo suffix. + +This data structure can be used in conjunction with files in which +certain genes are annotated to corresponding GO nodes. + +Each GO ID (e.g. "GO:1234567") has associated with it a GO node. That +GO node contains the name of the GO term, a list of the nodes directly +above the node ("parent nodes"), and a list of the nodes directly +below the current node ("child nodes"). The "ancestor nodes" of a +certain node are all of the nodes that are in a path from the current +node to the root of the ontology, with all repetitions removed. + +The example format is as follows: + +[Term] +id: GO:0000006 +name: high affinity zinc uptake transporter activity +namespace: molecular_function +def: "Catalysis of the reaction: Zn2+(out) = Zn2+(in), probably powered by proton motive force." [TC:2.A.5.1.1] +xref_analog: TC:2.A.5.1.1 +is_a: GO:0005385 ! zinc ion transporter activity + + +[Term] +id: GO:0000005 +name: ribosomal chaperone activity +namespace: molecular_function +def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913] +comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular +function. To update annotations, consider the molecular function term 'unfolded protein binding ; GO:0051082' and the biological process +term 'ribosome biogenesis and assembly ; GO:0042254' and its children. +is_obsolete: true + +=cut + +################################################################## +################################################################## + +use strict; +use warnings; +use diagnostics; + +use base qw (GO::OntologyProvider); +use GO::Node; +use Storable qw (nstore); + +use IO::File; + +our $VERSION = 0.01; +our $PACKAGE = "GO::OntologyProvider::OntologyOboParser"; + +################################################################## +# +# CLASS ATTRIBUTES +# +################################################################## + +# All the following class attributes are constants, that should be +# initialized here at compile time. + +my $DEBUG = 0; + +my $kFile = $PACKAGE.'::__file'; +my $kAspect = $PACKAGE.'::__aspect'; +my $kRootNode = $PACKAGE.'::__rootNode'; +my $kNodes = $PACKAGE.'::__nodes'; +my $kSecondaryIds = $PACKAGE.'::__secondaryIds'; +my $kParent = $PACKAGE.'::__parent'; +my $kCVSVersion = $PACKAGE.'::__cvsVersion'; # mark +my $kVersionDate = $PACKAGE.'::__versionDate'; # mark + +my %kAspects = ( + 'P' => 'biological_process', + 'F' => 'molecular_function', + 'C' => 'cellular_component' + ); + +################################################################## + +# The constructor, and associated initialization methods + +################################################################## +sub new { +################################################################## +# This is the constructor for an OntologyOboParser object. +# +# The constructor expects one of two type of arguments, either an +# 'ontologyFile' and 'ontology' argument , or an 'objectFile' argument. +# When instantiated with an ontologyFile argument, it expects the file +# to be in obo format. When instantiated with an objectFile argument, +# it expects to open a previously created OboParser object that +# has been serialized to disk. +# +# +# Usage : +# +# my $ontology = GO::OntologyProvider::OboParser->new(ontologyFile=>$file, +# ontology=>[P|F|C]); +# +# my $ontology = GO::OntologyProvider::OboParser->new(objectFile=>$file); +# + + my ($class, %args) = @_; + + my $self; + + if (exists($args{'objectFile'})){ + + $self = Storable::retrieve($args{'objectFile'}) + + }elsif (exists($args{'ontologyFile'})){ + + $self = {}; + + bless $self, $class; + + $self->__setFile($args{'ontologyFile'}, + $args{'aspect'}); + +# $self->__init($args{'termByGoidHash'}, $args{'nodeByTermHash'}); # hash from goids to terms, terms to nodes - mark + $self->__init(%args); # mark + + } + + return ($self); + +} + +############################################################################ +sub __setFile{ +############################################################################ +# This private method simply stores the name of the file used for +# construction inside the object's hash + + my ($self, $file, $aspect) = @_; + + if (!-e $file){ + + die "$file does not exist"; + + }elsif (-d $file){ + + die "$file is a directory"; + + }elsif (!-r $file){ + + die "$file is not readable"; + + }elsif ($file !~ /\.obo/){ + + die "$file must have a .obo suffix"; + + } + + if (!defined $aspect) { + + die "You have to pass the GO aspect [".join("\|", sort keys %kAspects) ."] to the ", ref($self); + + } + + if (!exists $kAspects{$aspect}) { + + die "Unknown aspect name: $aspect. The allowable GO aspects are ". join(", ", sort keys %kAspects)."\n"; + + } + + $self->{$kFile} = $file; + + $self->{$kAspect} = $aspect; + +} + +############################################################################ +sub __file { +############################################################################ +# This private method returns the name of the file used to construct the object + + return $_[0]->{$kFile}; + +} + +############################################################################ +sub __aspect { +############################################################################ +# This private method returns the name of the ontology used to construct the object + + return $_[0]->{$kAspect}; + +} + +############################################################################ +sub __init { # okay +############################################################################ +# This method initializes the ontologyOboParser object, by parsing an ontology +# file, and storing the structures represented therein, in memory. + + my $self = shift; + my %args = @_; + my $termByGoidHash = $args{termByGoidHash}; # if requested, maintain hash of goids to terms - mark + my $nodeByTermHash = $args{nodeByTermHash}; + + my $ontologyFh = IO::File->new($self->__file, q{<} )|| die "$PACKAGE can't open file ". $self->__file ." : $!"; + + my $aspect = $kAspects{$self->__aspect}; + + my @entryLine; + + my $isValidEntry = 0; + + my $namespace; + + my $headers = 1; + + while (<$ontologyFh>){ + + chomp; + + # first read header to get version and date -- mark + if ($headers) { + if (/^\s*date:\s+(\d+)\:(\d+)\:(\d+)\s/o) { + my $date = "$2/$1/$3"; + $self->{$kVersionDate} = $date; + } elsif (/cvs.*\$\s*Revision\:\s*([\d\.]+)(\s|\$)/o) { + $self->{$kCVSVersion} = $1; + } elsif (/^\s*$/) { + $headers = 0; + } + next; + } + + # finish parsing the obo file of we reach the typedef line. + + last if (/^\[Typedef\]/); + + if ($_ eq '[Term]') { + + # we reached a new term - so process the previous entry + + if ($isValidEntry) { + + $self->__processNode(\@entryLine, %args); + + } + + # reset our variables + + @entryLine = (); + + $isValidEntry = 0; + $namespace = ''; + + }elsif ($_ eq "namespace: $aspect"){ + + # term is in the requested namespace + + $namespace = $aspect; + + $isValidEntry = 1; + + }elsif ($_ eq 'is_obsolete: true'){ + + # we don't want obsolete nodes - DO NOT COMMENT THIS OUT - + # infinite recursion will result! + + # Note, the logic here relies on the is_obsolete line coming after the + # namespace line. + + $isValidEntry = 0; + + }else { + + # build up the information for this node + + push(@entryLine, $_); + + } + + } + + # process the final node + + if ($namespace eq $aspect && $isValidEntry) { + + $self->__processNode(\@entryLine, %args); + + } + + $ontologyFh->close || die "Can't close ". $self->__file ." : $!"; + + # now populate ancestor paths for each node. + + $self->__populatePaths; + +} + +############################################################################ +sub __processNode { +############################################################################ +# This private method processes entry lines identified as a node. +# The general idea is that it needs three pieces of +# information about the line to deal with it: +# +# 1. The name of the node. +# 2. The GOIDs associated with the node. +# 3. The parent node ids. +# +# It creates a node object for the current node and then indicates in that node +# the identity of its parent(s). + + my ($self, $entryLineArrayRef, %args) = @_; + my $termByGoidHash = $args{termByGoidHash}; + my $nodeByTermHash = $args{nodeByTermHash}; + + my ($nodeName, $goid, $secondaryGoidArrayRef, $parentGoidArrayRef) + = $self->__getNodeInfoFromLine($entryLineArrayRef, %args); + + my $node = $self->__createNode($goid, $nodeName); + + # to save memory when storing id -> term hash, we store only references + # note - this still duplicates terms since they are stored as node->term as well, + # but at least it does not duplicate terms for synonym goids + # - mark + my $termRef; + + if (scalar (@{$parentGoidArrayRef}) == 0) { # no parent goid + + # The GOA has obsoleted the 'Gene_Ontology' term, but + # currently we need it to make the graph work. Thus, we'll + # recreate the root, using it.s original id and name. This + # needs to be fixed in future. + + my $rootGoid = 'GO:0003673'; + my $rootTerm = 'Gene_Ontology'; + + my $rootNode = $self->__createNode($rootGoid, $rootTerm); + + $self->{$kRootNode} = $rootNode; + + @{$parentGoidArrayRef} = ($rootGoid); + + # don't forget to save the root goid map to the root term - mark + if (defined($termByGoidHash)) { + $termRef = $termByGoidHash->{$rootGoid} = \$rootTerm; +# print "HASH $rootGoid\n"; + } + if (defined($nodeByTermHash)) { + $nodeByTermHash->{$rootTerm} = $self; + } + + } else { + + # save the hash of the primary id to the term - mark + if (defined($termByGoidHash)) { + my $term = $node->term; + $termRef = $termByGoidHash->{$node->goid} = \$term; +# print "HASH ".$node->goid."\n"; + } + if (defined($nodeByTermHash)) { + $nodeByTermHash->{$node->term} = $self; + } + + } + + ## now hash any secondaries to the primary + + foreach my $secondaryId (@{$secondaryGoidArrayRef}){ + + $self->{$kSecondaryIds}{$secondaryId} = $goid; + + # don't forget to add these to the term hash - mark + # actually we only want primaries + # actually it doesn't seem to make a difference +# if (defined($termByGoidHash)) { +## $termByGoidHash->{$goid} = $termRef; +# print "NOHASH $goid\n"; +# } + + } + + $self->{$kParent}{$goid} = $parentGoidArrayRef; + +} + +############################################################################ +sub __getNodeInfoFromLine { # okay +############################################################################ + +# This private method takes an array reference to the lines for a +# given GO term node entry and returns the term name, a reference +# that points to an array of goids associated with that term name, and +# a reference that points to an array of direct parent GOIDs. The +# primary goid will be the first goid returned in the list. +# +# Usage: +# +# my ($termName, $goidArrayRef, $parentGoidArrayRef) +# = $self->__getNodeInfoFromLine($entryLineArrayRef); + + my ($self, $entryLineArrayRef, %args) = @_; + + my ($nodeName, $goid, @secondaryGoid, @parentGoid); + + foreach my $line (@{$entryLineArrayRef}) { + + if ($line =~ /^id: *(GO:0*[0-9]+)$/) { + + $goid = $1; + + }elsif ($line =~ /^name: *(.+)$/) { + + $nodeName = $1; + + }elsif ($line =~ /^alt_id: *(GO:0*[0-9]+)$/) { + + push(@secondaryGoid, $1); + + }elsif ($line =~ /^(is_a:|relationship: part_of) *(GO:0*[0-9]+)/) { + + push(@parentGoid, $2); + + }elsif (($args{linkRegulates}) && ($line =~ /^relationship: ((positively_|negatively_)?regulates) *(GO:0*[0-9]+)/)) { + # optionally follow regulation links - mark + + push(@parentGoid, $3); + + } + } + + # check that we can actually get some goids. Added this in to + # deal with when a broken file that appeared on the GO site, it + # caused me to get email saying my code was broken... + + if (!$goid){ + + die "ERROR: There appears to be a problem with the ontology file.\n". + "No GOIDs could be extracted from '$nodeName'.n\n"; + + } + + # remove \'s from nodeName + + $nodeName =~ s/\\//g; + + return ($nodeName, $goid, \@secondaryGoid, \@parentGoid); + + +} + +############################################################################### +sub __createNode { +############################################################################### + + my ($self, $goid, $nodeName) = @_; + + my $node; + + if ($self->__nodeIsAlreadyCreated($goid)){ + + $node = $self->nodeFromId($goid); + + } else { # node has not already been created + + # create node + + $node = GO::Node->new(goid => $goid, + term => $nodeName); + + # store it + + $self->{$kNodes}{$goid} = $node; + + } + + return $node; + +} + +############################################################################### +sub __populatePaths { +############################################################################### +# in this method, we populate all the paths to the root for each node +# in the ontology. To do this, we have to call the recursive method, +# __findAncestor(), which will build up each path from a node to the +# root, and when it reaches the end of the path (the root itself), +# will add that path via the Node method addPathToRoot. + +# POSSIBLE ALTERNATIVE APPROACH +# +# Profiling of the OboParser reveals that when building the ontology, +# ~77% of the time is spent in the recursive __findAncestor(). Thus, +# if a way could be found to decrease the number of recursive calls to +# that method, it might significantly positively impact the runtime +# performance. +# +# A possible alternative approach to the current method, might be to +# simply populate paths for every leaf node (we would need to know who +# they are), and as their paths are populated, also populate the paths +# for their ancestors as well, as the paths from the ancestors are +# subparts of the paths from leaves to the root. However, care would +# have to be taken to not add the same path twice, as there would be +# issues with when a leaf has two or more paths to a particular node, +# whose paths are then being added. Note also, if you encounter a +# node for whom you have already added paths, you don't need to add +# them again, so this might significantly save the number of recursive +# calls required. + + my $self = shift; + + # First, remove all links to missing nodes. + # If the parent of a node does not exist, it is probably because it is in a different namespace + # (or there is an error in the file). When we detect this, we must delete the link to that parent. -- mark + + foreach my $childGoid (keys %{$self->{$kParent}}) { + for (my $p = 0; $p <= $#{$self->{$kParent}{$childGoid}}; $p++) { + my $parentGoid = $self->{$kParent}{$childGoid}->[$p]; + my $parentNode = $self->{$kNodes}{$parentGoid}; + if (!$parentNode) { + print "REMOVING NON-EXISTENT PARENT $parentGoid FROM $childGoid\n"; + splice(@{$self->{$kParent}{$childGoid}}, $p, 1); + $p--; + } + } + } + + # go through each GO node in the $kParent hash, the keys of which + # are the goids that are parents of a given node. + + foreach my $childGoid ( keys %{$self->{$kParent}} ) { + + # note, we directly access the kNodes hash here, rather than + # use nodeFromId(). This is for performance reasons only - + # accessing the kNodes hash directly in this method, and the + # __findAncestor method shaces about 40% of the runtime off of + # the time taken to populate all the paths. + +# print $childGoid.":"; + + my $childNode = $self->{$kNodes}{$childGoid}; + + # now go through each of this child's parents + + foreach my $parentGoid (@{$self->{$kParent}{$childGoid}}) { + + ### Note, there has been a case in the obo file where + ### there was an error, and a node was listed as having + ### parent in a different aspect. This results in a fatal + ### run time error, as when the parser reads the file, it + ### only keeps nodes of a given aspect, and is thus left + ### with a dangling reference. In this case, parentNode + ### will be undef, and the call to addParentNodes ends up + ### in a run time error. We can add some logic here to + ### give a better error message. + + #! This was lifted from GO-TermFinder-0.83. This will happen + #! since they are now officially adding references between GO IDs + #! in different aspects (at least in slims, probably others. + #! We will need to find a way to deal with it. - mark + + my $parentNode = $self->{$kNodes}{$parentGoid}; +# +# || do { +# +# print "There is an error in the obo file, where the relationship between ", +# $childNode->goid, +# " and one or more of its parents is not correctly defined.\n", +# "Please check the obo file.\n", +# "The program is unable to continue.\n\n"; +# +# exit; +# +# }; + + # not needed -- mark +# #! instead, lets just ignore it +# if (!$parentNode) { +# print "NO NODE $parentNode\n"; +# next; +# } + + ### create connections between child node and its parent + + $childNode->addParentNodes($parentNode); + + $parentNode->addChildNodes($childNode); + + # begin to build the ancestor path, starting with this + # parent + + my @path = ($parentNode); + + if (exists $self->{$kParent}{$parentGoid}){ + + # if this parent has parents, then we continue to + # build the path upwards to the root. We pass in the + # child node, so that each path which reaches the root + # can be added during the recursive calls to find + # ancestor + + $self->__findAncestors($childNode, + $parentGoid, + \@path); + + }else{ + + # otherwise, the path only contains the root, and we add it. + + $childNode->addPathToRoot(@path); + + } + + } + +# print "\n"; + + } + +} + +####################################################################### +sub __findAncestors { +####################################################################### +# Usage: +# +# $self->__findAncestor($childNode, +# $parentGoid, +# $pathArrayRef); +# +# This method looks through each goid in hash %{$self->{$kParent}} to +# find all ancestors and push everything to @{$pathArrayRef}..And if +# there is no ancestor found for the $parentGoid, it just add the path +# to the child node. + + my ($self, $childNode, $parentGoid, $pathArrayRef) = @_; + + # go through each immediate parent of the passed in parent + +# print " ".$parentGoid; + + # If the parent of the passed in parent does not exist, it is probably because it is in a different namespace + # (or there is an error in the file). When we detect this, we must delete the link to that parent. -- mark + + foreach my $ancestorGoid (@{$self->{$kParent}{$parentGoid}}) { + +# for testing - no longer needed -- mark +# if ($ancestorGoid eq "GO:0050789") { +# print "EXISTS $ancestorGoid\n"; +# } + +# for testing - no longer needed -- mark +# if (!exists $self->{$kNodes}{$ancestorGoid}) { +# print "NOT EXISTS $ancestorGoid\n"; +# } + + # add the ancestor node to our path to the root which is being + # built + + push (@{$pathArrayRef}, $self->{$kNodes}{$ancestorGoid}); + + if (exists $self->{$kParent}{$ancestorGoid}){ + + # if this ancestor has parents, continue building the + # paths to the root recursively up the DAG + + $self->__findAncestors($childNode, + $ancestorGoid, + $pathArrayRef); + + }else { + + + $childNode->addPathToRoot(reverse @{$pathArrayRef}); + + } + + # because there are multiple paths to the root for most nodes, + # we have now remove the current ancestor from this time + # through the loop so that the path is reset to the original + # condition that it was in when passed in to this method + + pop @{$pathArrayRef}; + + } + +} + +############################################################################ +sub __nodeIsAlreadyCreated { # okay +############################################################################ +# This private method returns a boolean to indicate whether a node has +# already been created for a given GO ID. + + + return (exists($_[0]->{$kNodes}{$_[1]})); + +} + +############################################################################ +sub printOntology{ +############################################################################ +# This prints out the ontology, with redundancies. + + my $self = shift; + + $self->__printNode($self->rootNode, 0); + +} + +############################################################################ +sub __printNode{ +############################################################################ +# This recursive function prints the name of the specified node and the +# names of all of its descendants. +# + + my ($self, $node, $indentationLevel) = @_; + + print " " x $indentationLevel, $node->term, " ; ", $node->goid, "\n"; + + foreach my $childNode (sort {$a->term cmp $b->term} $node->childNodes) { + + $self->__printNode($childNode, $indentationLevel+1); + + } + +} + +############################################################################ +sub allNodes{ +############################################################################ +# This method returns an array of all the nodes that have been created. +# +# Usage: +# +# my @nodes = $ontologyParser->allNodes; + + return (values %{$_[0]->{$kNodes}}); + +} + +############################################################################ +sub rootNode{ +############################################################################ +# This returns the root node in the ontology. +# +# Usage: +# +# my $rootNode = $ontologyParser->rootNode; + + return ($_[0]->{$kRootNode}); + +} + +############################################################################ +sub nodeFromId{ +############################################################################ +# This public method takes a GOID and returns the GO::Node that +# it corresponds to. It should also work with secondary id's +# +# Usage : +# +# my $node = $ontologyParser->nodeFromId($goid); + + my ($self, $goid) = @_; + + if (exists ($self->{$kNodes}{$goid})){ # it's a primary + + return ($self->{$kNodes}{$goid}); + + }elsif (exists ($self->{$kSecondaryIds}{$goid})){ # it's a secondary + + return $self->{$kNodes}{$self->{$kSecondaryIds}{$goid}}; + + }else{ + + return undef; + + } + +} + +############################################################################ +sub numNodes{ +############################################################################ +# This public method returns the number of nodes that exist with the +# ontology +# +# Usage : +# +# my $numNodes = $ontologyParser->numNodes; + + return scalar (keys %{$_[0]->{$kNodes}}); + +} + +############################################################################ +sub serializeToDisk { +############################################################################ +# Saves the current state of the Ontology Parser Object to a file, +# using the Storable package. Saves in network order for portability, +# just in case. Returns the name of the file. If no filename is +# provided, then the name of the file (and it's directory, if one was +# provided) used for object construction, will be used, with .obj +# appended. If the object was instantiated from a file with a .obj +# suffix, then the same filename would be used, if none were provided. +# +# This method currently causes a segfault on MacOSX (at least 10.1.5 +# -> 10.2.3), with perl 5.6, and Storable 1.0.14, when trying to store +# the process ontology. This failure occurs using either store, or +# nstore, and is manifested by a segmentation fault. It has not been +# investigated whether this is a perl problem, or a Storable problem +# (which has large amounts of C-code). This does not cause a +# segmentation on Solaris, using perl 5.6.1 and Storable 1.0.13. This +# doesn't make it clear whether it's a MacOSX problem or a perl +# problem or not. It should be noted that newer versions of both perl +# and Storable exist, and the code should be tested with those as +# well. +# +# Usage: +# +# my $objectFile = $ontologyParser->serializeToDisk(filename=>$filename); + + my ($self, %args) = @_; + + my $fileName; + + if (exists ($args{'filename'})){ # they supply their own filename + + $fileName = $args{'filename'}; + + }else{ # we build a name from the file used to instantiate ourselves + + $fileName = $self->__file; + + if ($fileName !~ /\.obj$/){ # if we weren't instantiated from an object + + $fileName .= ".obj"; # add a .obj suffix to the name + + } + + } + + nstore ($self, $fileName) || die "$PACKAGE could not serialize itself to $fileName : $!"; + + return ($fileName); + +} + + +sub version { + my $self = shift; + return $self->{$kCVSVersion}; +} + + +sub versionDate { + my $self = shift; + return $self->{$kVersionDate}; +} + + +1; # to keep perl happy + + +# P O D D O C U M E N T A T I O N # + +=pod + +=head1 Instance Constructor + +=head2 new + +This is the constructor for an OboParser object. The constructor +expects one of two arguments, either an 'ontologyFile' argument, or an +'objectFile' argument. When instantiated with an ontologyFile +argument, it expects it to correspond to an obo file created by the GO +consortium, according to their file format, and in addition, also +requires an 'aspect' argument. When instantiated with an objectFile +argument, it expects to open a previously created ontologyParser +object that has been serialized to disk (see serializeToDisk). + +Usage: + + my $ontology = GO::OntologyProvider::OboParser->new(ontologyFile => $ontologyFile, + aspect => $aspect); + + my $ontology = GO::OntologyProvider::OboParser->new(objectFile => $objectFile); + +=head1 Instance Methods + +=head2 printOntology + +This prints out the ontology, with redundancies, to STDOUT. It does +not yet print out all of the ontology information (like relationship +type etc). This method will be likely be removed in a future version, +so should not be relied upon. + +Usage: + + $ontologyParser->printOntology; + +=head2 allNodes + +This method returns an array of all the GO:Nodes that have been +created. + +Usage: + + my @nodes = $ontologyParser->allNodes; + +=head2 rootNode + +This returns the root node in the ontology. + + my $rootNode = $ontologyParser->rootNode; + +=head2 nodeFromId + +This public method takes a GOID and returns the GO::Node that +it corresponds to. + +Usage : + + my $node = $ontologyParser->nodeFromId($goid); + +If the GOID does not correspond to a GO node, then undef will be +returned. Note if you try to call any methods on an undef, you will +get a fatal runtime error, so if you can't guarantee all GOIDs that +you supply are good, you should check that the return value from this +method is defined. + +=head2 numNodes + +This public method returns the number of nodes that exist with the +ontology + +Usage : + + my $numNodes = $ontologyParser->numNodes; + +=head2 serializeToDisk + +Saves the current state of the Ontology Parser Object to a file, using +the Storable package. Saves in network order for portability, just in +case. Returns the name of the file. If no filename is provided, then +the name of the file (and its directory, if one was provided) used for +object construction, will be used, with .obj appended. If the object +was instantiated from a file with a .obj suffix, then the same +filename would be used, if none were provided. + +This method currently causes a segfault on MacOSX (at least 10.1.5 -> +10.2.3), with perl 5.6, and Storable 1.0.14, when trying to store the +process ontology. This failure occurs using either store, or nstore, +and is manifested by a segmentation fault. It has not been +investigated whether this is a perl problem, or a Storable problem +(which has large amounts of C-code). This does not cause a +segmentation on Solaris, using perl 5.6.1 and Storable 1.0.13. This +does not make it clear whether it is a MacOSX problem or a perl +problem or not. It should be noted that newer versions of both perl +and Storable exist, and the code should be tested with those as well. + +Usage: + + my $objectFile = $ontologyParser->serializeToDisk(filename=>$filename); + +=head1 Authors + + Gavin Sherlock; sherlock@genome.stanford.edu + Elizabeth Boyle; ell@mit.edu + Shuai Weng; shuai@genome.stanford.edu + +=cut diff --git a/www/lib/GO/README b/www/lib/GO/README new file mode 100644 index 0000000..7f57d04 --- /dev/null +++ b/www/lib/GO/README @@ -0,0 +1 @@ +Some of these files are from the GO::TermFinder perl module, modified to support features of these tools. They must override those provided by GO::TermFinder. diff --git a/www/lib/GO/TermFinder.pm b/www/lib/GO/TermFinder.pm new file mode 100644 index 0000000..8cc75c8 --- /dev/null +++ b/www/lib/GO/TermFinder.pm @@ -0,0 +1,2134 @@ +package GO::TermFinder; + +# File : TermFinder.pm +# Author : Gavin Sherlock +# Date Begun : December 31st 2002 + +# $Id: TermFinder.pm,v 1.6 2015/01/29 00:22:38 mark Exp $ + +# License information (the MIT license) + +# Copyright (c) 2003-2006 Gavin Sherlock; Stanford University + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +=pod + +=head1 NAME + +GO::TermFinder - identify GO nodes that annotate a group of genes with a significant p-value + +=head1 DESCRIPTION + +This package is intended to provide a method whereby the P-values of a +set of GO annotations can be determined for a set of genes, based on +the number of genes that exist in the particular genome (or in a +selected background distribution from the genome), and their +annotation, and the frequency with which the GO nodes are annotated +across the provided set of genes. The P-value is simply calculated +using the hypergeometric distribution as the probability of x or more +out of n genes having a given annotation, given that G of N have that +annotation in the genome in general. We chose the hypergeometric +distribution (sampling without replacement) since it is more accurate, +though slower to calculate, than the binomial distribution (sampling +with replacement). + +In addition, a corrected p-value can be calculated, to correct for +multiple hypothesis testing. The correction factor used is the total +number of nodes to which the provided list of genes are annotated, +excepting any nodes which have only a single annotation in the +background, as a priori, we know that these cannot be significantly +enriched. The client has access to both the corrected and uncorrected +values. It is also possible to correct the p-value using 1000 +simulations, which control the Family Wise Error Rate - using this +option suggests that the Bonferroni correction is in fact somewhat +liberal, rather than conservative, as might be expected. Finally, the +False Discovery Rate can also be calculated. + +The general idea is that a list of genes may have been identified for +some reason, e.g. they are co-regulated, and TermFinder can be used to +find out if any nodes annotate the set of genes to a level which is +extremely improbable if the genes had simply been picked at random. + +=head1 TODO + +1. May want the client to decide the behavior for ambiguous names, + rather than having it hard coded (e.g. always ignore; use if + standard name (current implementation); use all databaseIds for + the ambiguous name; decide on a case by case basis (potentially + useful if running on command line)). + +2. Create new GO::Hypothesis and GO::HypothesisSet objects, so that + it is easier to access the information generated about the p-value + etc. of any particular GO node that annotates a set of genes. + +3. Instead of all the global variables, $k..., replace them with + constants, which may improve runtime, as the optimizer should + optimize the hash look ups to look like hard-coded strings at + runtime, rather than variable lookups. + +4. Lots of other stuff.... + +=cut + +use strict; +use warnings; +use diagnostics; + +use vars qw ($PACKAGE $VERSION $WARNINGS); + +use GO::Node; +use GO::TermFinder::Native; + +$VERSION = '0.83'; +$PACKAGE = 'GO::TermFinder'; + +$WARNINGS = 1; # toggle this to zero if you don't want warnings + +# class variables + +my @kRequiredArgs = qw (annotationProvider ontologyProvider aspect); + +my $kArgs = $PACKAGE.'::__args'; +my $kPopulationNamesHash = $PACKAGE.'::__populationNamesHash'; +my $kBackgroundDatabaseIds = $PACKAGE.'::__backgroundDatabaseIds'; +my $kTotalGoNodeCounts = $PACKAGE.'::__totalGoNodeCounts'; +my $kGoCounts = $PACKAGE.'::__goCounts'; +my $kGOIDsForDatabaseIds = $PACKAGE.'::__goidsForDatabaseIds'; +my $kDatabaseIds = $PACKAGE.'::__databaseIds'; +my $kTotalNumAnnotatedGenes = $PACKAGE.'::__totalNumAnnotatedGenes'; +my $kCorrectionMethod = $PACKAGE.'::__correctionMethod'; +my $kShouldCalculateFDR = $PACKAGE.'::__shouldCalculateFDR'; +my $kPvalues = $PACKAGE.'::__pValues'; +my $kDatabaseId2OrigName = $PACKAGE.'::__databaseId2OrigName'; +my $kDistributions = $PACKAGE.'::__distributions'; +my $kDiscardedGenes = $PACKAGE.'::__discardedGenes'; +my $kDirectAnnotationToAspect = $PACKAGE.'::__directAnnotationToAspect'; + +# the methods by which the p-value can be corrected + +my %kAllowedCorrectionMethods = ('bonferroni' => undef, + 'none' => undef, + 'simulation' => undef); + +# set up a GO node that corresponds to anything passed in that has no +# annotation + +my $kUnannotatedNode = GO::Node->new(goid => "GO:XXXXXXX", + term => "unannotated"); + +my $kFakeIdPrefix = "NO_DETERMINED_DATABASE_ID_"; + + +##################################################################### +sub new{ +##################################################################### +=pod + +=head1 Instance Constructor + +=head2 new + +This is the constructor. It expects to be passed named arguments for +an annotationProvider, and an ontologyProvider. In addition, it must +be told the aspect of the ontology provider, so that it knows how to +query the annotationProvider. + +There are also some additional, optional arguments: + +population: + +This argument allows a client to indicate the population that should +used to calculate a background distribution of GO terms. In the +absence of population argument, then the background distribution will +be drawn from all genes in the annotationProvider. This should be +provided as an array reference, and no ambiguous names should be +provided (see AnnotationProvider for details of name ambiguity). This +option is particularly pertinent in a case where for example you +assayed only 2000 genes in a two hybrid experiment, and found 20 +interesting ones. To find significant terms, you need to do it in the +context of the genes that you assayed, not in the context of all genes +with annotation. + +Note, new in version 0.71, if you provided a population as the +background distribution from which genes have been drawn, any genes +provided to the findTerms method that are not in the background +distribution will be discarded from the calculations. The identity of +these genes can be retrieved using the discardedGenes() method, after +the findTerms() method has been called. + +totalNumGenes: + +This argument allows a client to indicate that the size of the +background distribution is in fact larger that the number of genes +that exist in the annotation provider, and the extra genes are merely +assumed to be entirely unannotated. + +NB: This is an API change, as totalNumGenes was previously required. + +Thus - if using 'population', the total number of genes considered as +the background will be the number of genes in the provided population. +If not using 'population', then the number of genes that will be +considered as the total population will be the number of genes in the +annotationProvider. However, if the totalNumGenes argument is +provided, then that number will be used as the size of the population. +If it is not larger than the total number of genes in the +annotationParser, then the number of genes in the annotationParser +will be used. The totalNumGenes and the population arguments are +mutually exclusive, and both should not be provided at the same time. + +Usage ($num is larger than the number of genes with annotations): + + my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider, + ontologyProvider => $ontologyProvider, + totalNumGenes => $num, + aspect => ); + + +Usage (use all annotated genes as population): + + my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider, + ontologyProvider => $ontologyProvider, + aspect => ); + +Usage (use a subset of genes as the background population): + + my $termFinder = GO::TermFinder->new(annotationProvider=> $annotationProvider, + ontologyProvider => $ontologyProvider, + population => \@genes, + aspect => ); + +=cut + + my ($class, %args) = @_; + + my $self = {}; + + bless $self, $class; + + $self->__checkAndStoreArgs(%args); + + $self->__init; # initialize counts for all GO nodes + + return $self; + +} + +##################################################################### +sub __checkAndStoreArgs{ +##################################################################### +# This private method simply checks that all the required arguments +# have been provided, and stores them within the object + + my ($self, %args) = @_; + + # first check that the required arguments were provided + + foreach my $arg (@kRequiredArgs){ + + if (!exists ($args{$arg})){ + + die "You did not provide a $arg argument."; + + }elsif (!defined ($args{$arg})){ + + die "Your $arg argument is not defined"; + + } + + $self->{$kArgs}{$arg} = $args{$arg}; # store in object + + } + + # store the population, and also create a hash of the population + # names for quick look up + + if (exists($args{'population'})){ + + $self->{$kArgs}{'population'} = $args{'population'}; + + my %population; + + foreach my $name (@{$args{'population'}}){ + + $population{$name} = undef; + + } + + $self->{$kPopulationNamesHash} = \%population; + + } + + if (exists($args{'totalNumGenes'})){ + + $self->{$kArgs}{'totalNumGenes'} = $args{'totalNumGenes'}; + + } + + # now check that we didn't get a funky combination + + if (exists($args{'population'}) && exists($args{'totalNumGenes'})){ + + die "ERROR: The population and totalNumGenes arguments are mutually exclusive, but you have provided both."; + + } + +} + +##################################################################### +sub __init{ +##################################################################### +# This private method determines all counts to all GO nodes, as the +# background frequency of annotations in the genome + + my ($self) = @_; + + # first we determine the databaseIds for the background + # distribution + + my @databaseIds; + + if ($self->__isUsingPopulation){ + + # we need to get databaseids for the provided population + + my ($databaseIdsRef, $databaseId2OrigNameRef) = $self->__determineDatabaseIdsFromGenes($self->__population); + + @databaseIds = @{$databaseIdsRef}; + + }else{ + + # we simply use all databaseIds from the annotationProvider + + @databaseIds = $self->__annotationProvider->allDatabaseIds(); + + } + + my $populationSize = scalar(@databaseIds); + + # check that they said there's at least as many genes in total + # as the annotation provider says that there is. + + if (! defined $self->totalNumGenes){ + + # in this case, no 'totalNumGenes' argument was provided + + $self->{$kArgs}{totalNumGenes} = $populationSize; + + }elsif ($populationSize > $self->totalNumGenes){ + + # in this case, they are using an annotation provider, and + # have provided a totalNumGenes that is less than the number + # of genes that the annotation provider knows about + + if ($WARNINGS){ + + print STDERR "The annotation provider indicates that there are more genes than the client indicated.\n"; + print STDERR "The annotation provider indicates there are $populationSize, while the client indicated only ", $self->totalNumGenes, ".\n"; + print STDERR "Thus, assuming the correct total number of genes is that indicated by the annotation provider.\n"; + + } + + $self->{$kArgs}{totalNumGenes} = $populationSize; + + } + + # now determine the level of annotation for each GO node in the + # population of genes to be used as the background distribution + my $totalNodeCounts = $self->__buildHashRefOfAnnotations(\@databaseIds); + + # adjust those counts if needs be + + if ($populationSize < $self->totalNumGenes){ + + # if there are extra, entirely unannotated genes (indicated by + # the total number of genes provided being greater than the + # number that existed in the annotation provider), we must + # make sure that it's treated that they will at least be + # annotated to the root (Gene Ontology), and its immediate + # child (which is the name of the Ontology, eg + # Biological_process, Molecular_function, and + # Cellular_component), and the 'unannotated' node + + # so simply add extra annotations + + my $rootNodeId = $self->__ontologyProvider->rootNode->goid; + + my $childNodeId = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid; + + $totalNodeCounts->{$rootNodeId} = $self->totalNumGenes; + + $totalNodeCounts->{$childNodeId} += ($self->totalNumGenes - $populationSize); + + $totalNodeCounts->{$kUnannotatedNode->goid} += ($self->totalNumGenes - $populationSize); + + } + + # and now store the information + + $self->{$kTotalGoNodeCounts} = $totalNodeCounts; + + $self->{$kTotalNumAnnotatedGenes} = $populationSize; + + # set the discarded genes to be a reference to an empty list + # (technically they shouldn't ask to retrieve the discarded genes + # before calling findTerms, but this will prevent such behavior + # from being fatal + + $self->{$kDiscardedGenes} = []; + + # store a hash of the databaseIDs that are in the background set of genes + + my %databaseIds; + + foreach my $databaseId (@databaseIds){ + + $databaseIds{$databaseId} = undef; + + } + + $self->{$kBackgroundDatabaseIds} = \%databaseIds; + + # create a Distributions object, which has C code for all the various + # Math that we will do. + + $self->{$kDistributions} = GO::TermFinder::Native::Distributions->new($self->totalNumGenes); + +} + +=pod + +=head1 Instance Methods + +=cut + +##################################################################### +sub findTerms{ +##################################################################### +=pod + +=head2 findTerms + +This method returns an array of hash references, one for each GO::Node +that was tested as a hypothesis, that indicates which terms annotate +the list of genes with what P-values. The contents of the hashes in +the returned array depend on some of the run time options. They are: + + key value + ------------------------------------------------------------------------- + +Always Present: + + NODE A GO::Node + + PVALUE The P-value for having the observed number of + annotations that the provided list of genes + has to that node. + + NUM_ANNOTATIONS The number of genes within the provided list that + are annotated to the node. + + TOTAL_NUM_ANNOTATIONS The number of genes in the population (total + or provided) that are annotated to the node. + + ANNOTATED_GENES A hash reference, whose keys are the + databaseIds that are annotated to the node, + and whose values are the original name + supplied to the findTerms() method. + +Present if corrected p-values are calculated: + + CORRECTED_PVALUE The CORRECTED_PVALUE is the PVALUE, but corrected + for multiple hypothesis testing, due to the + fact that you are more likely to generate + significant looking p-values if you test a + lot of hypotheses. See below for details of + how this pvalue is calculated, and the + options associated with it. + +Present if p-values were corrected by simulation: + + NUM_OBSERVATIONS The number of simulations in which a p-value as + good as this one, or better, was observed. + +Present if the False Discovery Rate is calculated: + + FDR_RATE The False Discovery Rate - this is a fraction + of how many of the nodes with p-values as good or + better than the node with this FDR would be expected + to be false positives. + + FDR_OBSERVATIONS The average number of nodes during simulations + that had an uncorrected p-value as good or better + than the p-value of this node. + + EXPECTED_FALSE_POSITIVES The expected number of false positives if this node + is chosen as the cut-off. + +The entries in the returned array are sorted by increasing p-value +(i.e. least likely is first). If there is a tie in the p-value, then +the sort order is determined by GOID, using a cmp comparison. + +findTerm() expects to be passed, by reference, a list of gene names +for which terms will be found. If a passed in name is ambiguous (see +AnnotationProvider), then the following will occur: + + 1) If the name can be used as a standard name, it will assume that + it is that. + + 2) Otherwise it will not use it. + +Currently a warning will be printed to STDOUT in the case of an +ambiguous name being used. + +The passed in gene names are converted into a list of databaseIds. If +a gene does not map to a databaseId, then an undef is put in the list +- however, if the same gene name, which does not map to a databaseId, +is used twice then it will produce only one undef in the list. If +more than one gene name maps to the same databaseId (either because +you used the same name twice, or you used an alias as well), then that +databaseId is only put into the list once, and a warning is printed. + +If a gene name does not have any information returned from the +AnnotationProvider, then it is assumed that the gene is entirely +unannotated. For these purposes, TermFinder annotates such genes to +the root node (Gene_Ontology), its immediate child (which indicates +the aspect of the ontology (such as biological_process), and a dummy +go node, corresponding to unannotated. This node will have a goid of +'GO:XXXXXXX', and a term name of 'unannotated'. No other information +will be set up for this GO::Node, so you should not count on being +able to retrieve it. What it does mean is that you can determine if +the predominant feature of a set of genes is that they have no +annotation. + +If more genes are provided that have been indicated exist in the +genome (as provided during object construction), then an error message +will be printed out, and an empty list will be returned. + +In addition, it is possible that for a small list of genes, that no +hypotheses will be tested - in this case, those genes will only have +annotated nodes with a count of 1, other than the Gene_Ontology node +itself, and the node corresponding to the aspect of the ontology. +Neither of these are considered for p-value testing, as a priori they +must have a p-value of 1. + +MULTIPLE HYPOTHESIS CORRECTION + +An optional argument, 'correction' may be used, which indicates what +method of multiple hypothesis correction should be used. Multiple +hypothesis correction attempts to keep the overall chance of getting +any false positives at the same level (e.g. 0.05). Acceptable values +are: + +bonferroni, none, simulation + + : 'bonferroni' will correct the p-values by using as the correction + factor the total number of nodes to which the provided list of + genes are annotated, either directly or indirectly, excepting any + nodes that are annotated only once in the background distribution, + as, a priori, these cannot be overrepresented. + + : 'none' will perform no multiple hypothesis correction + + : 'simulation' will run 1000 simulations with random lists of genes + (the same size as the originally provided gene list), and determine + a corrected value by how many simulations produced a p-value better + than the p-value associated with one of the real hypotheses. + E.g. if a node from the real data has a p-value of 0.05, but a + p-value that good or better is generated in 500 out of 1000 trials, + the corrected pvalue will be 0.5. In the case that a p-value + generated from a real list of genes is never seen in the + simulations, it will be given a corrected p-value of < 0.001, and + the NUM_OBSERVATIONS attribute of the hypothesis will be 0. Using + this option takes 1000 time as long! + +The default for this argument, if not provided, is bonferroni. + +FALSE DISCOVERY RATE + +As a way of preempting the potential problems of using p-values +corrected for multiple hypothesis testing, the False Discovery Rate +can instead be calculated, and you can instead set your cutoff based +on an acceptable false discovery rate, such as 0.01 (1%), or 0.05 (5%) +etc. Thus, the optional argument 'calculateFDR' can be used. A +non-zero value means that the False Discovery Rate will be calculated +for each node, such that you can determine, if you chose your p-value +cut-off at that node, what the FDR would be. The FDR is calculated by +running 50 simulations, and counting the average number of times a +p-value as good or better that a p-value generated from the real data +is seen. This is used as the numerator. The denominator is the +number of p-values in the real data that are as good or better than +it. + +Usage example - in this example, the default (Bonferroni) correction +is used to calculate a corrected p-value, and in addition, the False +Discovery Rate is also calculated: + + my @pvalueStructures = $termFinder->findTerms(genes => \@genes, + calculateFDR => 1); + + my $hypothesis = 1; + + foreach my $pvalue (@pvalueStructures){ + + print "-- $hypothesis of ", scalar @pvalueStructures, "--\n", + + "GOID\t", $pvalue->{NODE}->goid, "\n", + + "TERM\t", $pvalue->{NODE}->term, "\n", + + "P-VALUE\t", $pvalue->{PVALUE}, "\n", + + "CORRECTED P-VALUE\t", $pvalue->{CORRECTED_PVALUE}, "\n", + + "FALSE DISCOVERY RATE\t", $pvalue->{FDR_RATE}, "\n", + + "NUM_ANNOTATIONS\t", $pvalue->{NUM_ANNOTATIONS}, " (of ", $pvalue->{TOTAL_NUM_ANNOTATIONS}, ")\n", + + "ANNOTATED_GENES\t", join(", ", values (%{$pvalue->{ANNOTATED_GENES}})), "\n\n"; + + $hypothesis++; + + } + +If a background population had been provided when the object was +constructed, you should check to see if any of your genes for which +you are finding terms were discarded, due to not being found in the background +population, e.g.: + + my @pvalueStructures = $termFinder->findTerms(genes => \@genes, + calculateFDR => 1); + + my @discardedGenes = $termFinder->discardedGenes; + + if (@discardedGenes){ + + print "The following genes were not considered in the pvalue +calculations, as they were not found in the provided background +population.\n\n", join("\n", @discardedGenes), "\n\n"; + + } + +=cut + + my ($self, %args) = @_; + + # let's check that they have provided the required information + + $self->__checkAndStoreFindTermsArgs(%args); + + # now we determine all the count for direct and indirect + # annotations for the provided list of genes. + + $self->{$kGoCounts} = $self->__buildHashRefOfAnnotations([$self->genesDatabaseIds]); + + # now we have these counts, and because we determined the counts + # of the background distribution during object construction, we + # can determine the p-values for the annotations of our list of + # genes of interest. + + $self->__calculatePValues; + + # now we want to add in which genes were annotated to each node + # so that the client can determine them + + $self->__addAnnotationsToPValues; + + # now what we want to do is calculate pvalues that are corrected + # for multiple hypothesis testing, unless it is specifically + # requested not to. + + $self->__correctPvalues unless ($self->__correctionMethod eq 'none'); + + # now calculate the False Discovery Rate, if requested to + + $self->__calculateFDR if ($self->__shouldCalculateFDR); + + return $self->__pValues; + +} + +##################################################################### +sub __checkAndStoreFindTermsArgs{ +##################################################################### +# This private method checks the arguments that are passed into the +# findTerms() method, and stores various variables internally. + + my ($self, %args) = @_; + + # check they gave us a list of genes + + if (!exists ($args{'genes'})){ + + die "You must provide a genes argument"; + + }elsif (!defined ($args{'genes'})){ + + die "Your genes argument is undefined"; + + } + + # see if they gave us an allowable method by which to correct for + # multiple hypotheses + + $self->{$kCorrectionMethod} = $args{'correction'} || 'bonferroni'; + + if (!exists $kAllowedCorrectionMethods{$self->__correctionMethod}){ + + die $self->__correctionMethod." is not an allowed correction method. Use one of :". + + join(", ", keys %kAllowedCorrectionMethods); + + } + + # store whether to calculate the FDR + + if (exists $args{'calculateFDR'} && $args{'calculateFDR'} != 0){ + + $self->{$kShouldCalculateFDR} = 1; + + }else{ + + # default is not to calculate it + + $self->{$kShouldCalculateFDR} = 0; + + } + + # what we want to do now is build up an array of identifiers that + # are unambiguous - ie databaseIds + # + # This means that when retrieving GOID's, we can always retrieve + # them by databaseId, which is unambiguous. + + my ($databaseIdsRef, $databaseId2OrigNameRef) = $self->__determineDatabaseIdsFromGenes($args{'genes'}); + + # now we want to make sure that if they provided a population as + # the background, then all of the provided genes that are being + # tested for enriched GO terms are sampled from that population + + my @discardedGenes; + + if ($self->__isUsingPopulation){ + + my @missingIds; + + # go through each databaseID, and see if it is in the databaseIDs + # associated with the GO counts for the background population. If + # it's a fake ID, then see if the original name is in the names + # that were passed in. + + foreach my $databaseId (@{$databaseIdsRef}){ + + # if it's a fake databaseId, we have to see if the orig + # name was in the provided population, otherwise, if it's + # a real databaseId, check that the databaseId is in the + # background + + if (( + + $databaseId =~ /^$kFakeIdPrefix/o && + !$self->__origNameInPopulation($databaseId2OrigNameRef->{$databaseId})) + + || + + !$self->__databaseIdIsInBackground($databaseId)){ + + push(@missingIds, $databaseId); + + } + + } + + # Now see if we have any missing names + + # If we have as many missing names as there were genes + # provided, then we'll die, as there is nothing that can be + # done, as no gene remain for any enrichment calculations + + if (@missingIds == @{$databaseIdsRef}){ + + die "ERROR: None of the genes provided for analysis are found in the background population. Please check your background list.\n"; + + } + + # Otherwise, we will print a warning that genes were + # discarded, but we also provide an API for them to retrieve + # the names of genes that were discarded. + + if (@missingIds){ + + if ($WARNINGS){ + + print STDERR "\nThe following names in the provided list of genes do not have a\n", + + "counterpart in the background population that you provided.\n", + + "These genes will not be used in the analysis for enriched GO terms.\n\n"; + + foreach my $databaseId (@missingIds){ + + print STDERR $databaseId2OrigNameRef->{$databaseId}, "\n"; + + } + + print STDERR "\n"; + + } + + # now we have to actually remove them from the list of + # considered genes + + # create a dummy hash of the databaseIds, delete the + # elements, and then assign the remaining keys back to the + # $databaseIdsRef + + # we'll also remember it + + my %dummyDatabaseIdsHash = %{$databaseId2OrigNameRef}; + + foreach my $databaseId (@missingIds){ + + push (@discardedGenes, $databaseId2OrigNameRef->{$databaseId}); + + delete $dummyDatabaseIdsHash{$databaseId}; + + } + + $databaseIdsRef = [keys %dummyDatabaseIdsHash] + + } + + } + + # now remember the genes that were discarded + + $self->__setDiscardedGenes(\@discardedGenes); + + # now store the databaseIDs for the genes that can be used to + # determine enriched GO terms in the self object + + $self->{$kDatabaseIds} = $databaseIdsRef; + + # also store the mapping of the databaseId to its original name + + $self->{$kDatabaseId2OrigName} = $databaseId2OrigNameRef; + + # note, we need to provide the client with a way of determining + # how many genes were used when calculating p-values for + # annotations + + if (scalar ($self->genesDatabaseIds) > $self->totalNumGenes){ + + if ($WARNINGS){ + + print "You have provided a list corresponding to ", scalar ($self->genesDatabaseIds), "genes, ", + + "yet you have indicated that there are only ", $self->totalNumGenes, " in the genome.\n"; + + print "No probabilities can be calculated.\n"; + + } + + return (); # simply return an empty list + + } + + + +} + +##################################################################### +sub discardedGenes { +##################################################################### +=pod + +=head2 discardedGenes + +This method returns an array of genes which were discarded from the +pvalue calculations, because they could not be found in the background +population. It should only be called after findTerms. It will either +return an empty list, if no genes were discarded, or an array of genes +that were discarded. + +Usage: + + my @pvalueStructures = $termFinder->findTerms(genes => \@genes, + calculateFDR => 1); + + my @discardedGenes = $termFinder->discardedGenes; + + if (@discardedGenes){ + + print "The following genes were not considered in the pvalue +calculations, as they were not found in the provided background +population.\n\n", join("\n", @discardedGenes), "\n\n"; + + } + +=cut + + return @{$_[0]->{$kDiscardedGenes}}; + +} + + +# +# PRIVATE INSTANCE METHODS +# + +##################################################################### +sub __databaseIdIsInBackground{ +##################################################################### +# This private method will return a Boolean to indicate whether the +# supplied databaseId is in the set of databaseIds determined for the +# background set of genes. Note, it does not check if the databaseId +# is a fake one, so the client should do that if it needs to + + return exists $_[0]->{$kBackgroundDatabaseIds}{$_[1]}; + +} + +##################################################################### +sub __isUsingPopulation{ +##################################################################### +# This private method returns a boolean to indicate whether the client +# passed in a population of genes to use as the background distribution + + return exists $_[0]->{$kArgs}{population}; + +} + +##################################################################### +sub __population{ +##################################################################### +# This private method returns a reference to an array of identifiers +# that were passed in to be used as a background population + + return $_[0]->{$kArgs}{population}; + +} + +##################################################################### +sub __origNameInPopulation{ +##################################################################### +# This private method returns a Boolean to indicate whether the +# provided name is in the list of names that were provided as a +# background population + + return exists $_[0]->{$kPopulationNamesHash}{$_[1]}; + +} + +##################################################################### +sub __setDiscardedGenes{ +##################################################################### +# This private method will store the passed in array reference, which +# points to a list of genes that had to be discarded. + + $_[0]->{$kDiscardedGenes} = $_[1]; + +} + +##################################################################### +sub __totalNumAnnotatedGenes{ +##################################################################### +# This private method returns the number of genes that have any annotation, +# as determined from the AnnotationProvider. This is set during object +# initialization. + + return $_[0]->{$kTotalNumAnnotatedGenes}; +} + + +##################################################################### +sub __numAnnotatedNodesInBackground{ +##################################################################### +# This private method returns the number of nodes in the ontology that +# have any annotation in the background distribution. This is stored +# during object initialization as a hash of GOID to the number of +# counts. + + return scalar keys %{$_[0]->{$kTotalGoNodeCounts}}; + +} + +##################################################################### +sub __allGoIdsForBackground{ +##################################################################### +# This private method returns as an array all the GOIDs of nodes in +# the ontology that have any annotation in the background +# distribution. This is stored during object initialization as a hash +# of GOID to the number of counts. + + return keys %{$_[0]->{$kTotalGoNodeCounts}}; + +} + +##################################################################### +sub genesDatabaseIds{ +##################################################################### +=pod + +=head2 genesDatabaseIds + +This method returns an array of databaseIds corresponding to the genes +that were used for the findTerms() method. Thus it allows a client to +find out how many actual entities their list of genes that were passed +in mapped to, e.g. they may have passed in the same thing with two +different names. Using this method, immediately following use of the +findTerms method, they will determine how many genes their list +collapsed to. + +=cut + + return @{$_[0]->{$kDatabaseIds}}; + +} + +##################################################################### +sub __origNameForDatabaseId{ +##################################################################### +# This method returns the original name that was provided to the term +# finder for the databaseId that it was translated to. + + return $_[0]->{$kDatabaseId2OrigName}->{$_[1]}; + +} + +##################################################################### +sub __pValues{ +##################################################################### +# This method returns an array of pValues structures + + return @{$_[0]->{$kPvalues}}; + +} + +##################################################################### +sub __correctionMethod{ +##################################################################### +# This method returns the name of the method by which the client has +# chosen to have their p-values corrected - either none, bonferroni, +# custom, or simulation. + + return $_[0]->{$kCorrectionMethod}; + +} + +##################################################################### +sub __shouldCalculateFDR{ +##################################################################### +# This method returns a boolean, to indicate whether the false discovery +# rate should be calculated + + return $_[0]->{$kShouldCalculateFDR}; + +} + +##################################################################### +sub __determineDatabaseIdsFromGenes{ +##################################################################### +# This method determines a list of databaseIds for a list of genes +# passed in by reference. It then returns a reference to that list, +# and a reference to a hash that maps the databaseIds to the +# originally supplied name +# +# If more than one gene maps to the same databaseId, then the +# databaseId is only put in the list once, and a warning is printed. +# +# If a gene does not map to a databaseId, then an undef is put in the +# list - however, if the same gene name, which does not map to a +# databaseId, is used twice then it will produce only one undef in the +# list. +# +# In addition, it removes leading and trailing whitespace from supplied +# gene names (assuming they should have none) and will skip any names that +# are either empty, or whitespace only. + + my ($self, $genesRef) = @_; + + my (@databaseIdsList, $databaseId, %databaseIds, %genes, %duplicates, %warned); + + foreach my $gene (@{$genesRef}){ + + # strip leading and trailing spaces + + $gene =~ s/^\s+//; + $gene =~ s/\s+$//; + + next if $gene eq ""; # skip empty names + + # skip and warn if we've already seen the gene + + if (exists ($genes{$gene})){ + + if ($WARNINGS && !exists($warned{$gene})){ + + print "The gene name '$gene' was used more than once.\n"; + print "It will only be considered once.\n\n"; + + $warned{$gene} = undef; + + } + + next; # just skip to the next supplied gene + + } + + # determine if the gene is ambiguous + + if ($self->__annotationProvider->nameIsAmbiguous($gene)){ + + print "$gene is an ambiguous name.\n" if $WARNINGS; + + if ($self->__annotationProvider->nameIsStandardName($gene)){ + + if ($WARNINGS){ + + print "Since $gene is used as a standard name, it will be assumed to be one.\n\n"; + + } + + # returns undef if not found or ambiguous + $databaseId = $self->__annotationProvider->databaseIdByStandardName($gene); + + if ($databaseId) { #! mark + push (@databaseIdsList, $databaseId); + } else { + print "No database ID could be found for $gene.\n\n"; + } + + }else{ + + if ($WARNINGS){ + + print "Since $gene is an ambiguous alias, it will not be used.\n\n"; + + } + + # Without a next here, we could end up popping some other + # database Id from @databaseIds, since this one was never pushed, + # and $databaseId contains a previous value. + # Do this or set $databaseId to undef at the end of the loop (or both) -- mark + next; + + } + + }else{ + + # note, if the gene has no annotation, then we will want + # to create a fake databaseId, that we can easily + # recognize, and will have to make sure that we deal with + # this later when getting annotations. + + $databaseId = $self->__annotationProvider->databaseIdByName($gene); + + # if the total number of genes is equal to the number of + # things with some annotation, then there should be no + # genes that do not return a databaseId. If this is the + # case, we will warn them. + + if (!defined $databaseId){ + + # If we've already defined the total number of genes + # with annotation, and it's equal to the number of + # genes for the background distribution, and we're not + # using a population, we'll print a warning, as under + # these circumstances we shouldn't not get a + # databaseId. + + if (defined ($self->__totalNumAnnotatedGenes) && + $self->__totalNumAnnotatedGenes == $self->totalNumGenes && + $WARNINGS && + !$self->__isUsingPopulation){ + + print "\nThe name '$gene' did not correspond to an entry from the AnnotationProvider.\n"; + print "However, the client has indicated that all genes have annotation.\n"; + print "You should probably check that '$gene' is a real name.\n\n"; + + } + + # Now we need to deal with the lack of databaseId + # We'll simply create a fake one, that we can easily + # recognize later, so we can deal with it accordingly + + $databaseId = $kFakeIdPrefix.$gene; + + } + + push (@databaseIdsList, $databaseId); + + } + + # if we have a databaseId that we've already seen, we want to + # make sure we only consider it once. + + if (defined ($databaseId) && exists($databaseIds{$databaseId})){ + + pop (@databaseIdsList); # get rid of the extra + + # and let's remember what it was, as well as the previous + # name associated with this databaseId, so we can give an + # appropriate warning + + $duplicates{$databaseId}{$gene} = undef; + $duplicates{$databaseId}{$databaseIds{$databaseId}} = undef; + + } + + # remember the databaseId and gene, in case we see them again + + $databaseIds{$databaseId} = $gene if (defined ($databaseId)); + $genes{$gene} = undef; + $databaseId = undef; # -- mark + + } + + + if (%duplicates && $WARNINGS){ + + print "The following databaseIds were represented multiple times:\n\n"; + + foreach my $duplicate (sort keys %duplicates){ + + print $duplicate, " represented by ", join(", ", (sort keys %{$duplicates{$duplicate}})), "\n"; + + } + + print "\nEach of these databaseIds will only be considered once.\n"; + + } + + # return databaseIds, and their mapping to the originally supplied + # name + + return (\@databaseIdsList, \%databaseIds); + +} + +############################################################################ +sub __buildHashRefOfAnnotations{ +############################################################################ +# This private method takes a reference to an array of databaseIds and +# calculates the level of annotations for all GO nodes that those +# databaseIds have either direct or indirect annotation for. It +# returns a reference to a hash of GO node counts, with the goids +# being the keys, and the number of annotations they have from the +# list of databaseId's being the values. + + my ($self, $databaseIdsRef) = @_; + + my %goNodeCounts; + + # keep track of how many are annotated to the aspect node + # (e.g. such as molecular function). See comments for + # __allGOIDsForDatabaseId for more information + + my $aspectNodeDirectAnnotations = 0; + + my $aspectNodeGoid = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid; + + # If gene has no annotation, annotate it to the top node + # (Gene_Ontology), and its immediate child (the aspect itself) and + # the 'unannotated' node. + + my @noAnnotationNodes = ($aspectNodeGoid, + $self->__ontologyProvider->rootNode->goid, + $kUnannotatedNode->goid); + + foreach my $databaseId (@{$databaseIdsRef}) { + + # get goids count, if the databaseId is not a fake one + + my $goidsRef; + + if ($databaseId !~ /^$kFakeIdPrefix/o){ + + $goidsRef = $self->__allGOIDsForDatabaseId($databaseId); + + } + + if (!defined $goidsRef || !(@{$goidsRef})) { + + # If gene has no annotation, annotate it to the top node + # (Gene_Ontology), and its immediate child (the aspect itself) + # and the 'unannotated' node, which we cached earlier. + + $goidsRef = [@noAnnotationNodes]; + + # now cache the goids for the unnannotated genes. The + # ones that were annotated, had their goids cached in the + # __allGOIDsForDatabaseId. It is an optimization to take + # care of that there, but this here. + + $self->{$kGOIDsForDatabaseIds}->{$databaseId} = $goidsRef; + } + + # increment count for all goids appearing in @goids; + + foreach my $goid (@{$goidsRef}) { + + $goNodeCounts{$goid}++; + + } + + # keep count of how many are directly annotated to the aspect node + + if (exists ($self->{$kDirectAnnotationToAspect}{$databaseId})){ + + $aspectNodeDirectAnnotations++; + + } + + } + + # now we'd like to replace the counts for the aspect annotations, + # so that they only refer to the direct annotations, rather than + # direct and indirect annotations + + $goNodeCounts{$aspectNodeGoid} = $aspectNodeDirectAnnotations; + + return \%goNodeCounts; + +} + +############################################################################ +sub __allGOIDsForDatabaseId{ +############################################################################ +# This method returns a reference to an array of all GOIDs to which a +# databaseId is annotated, whether explicitly, or implicitly, by +# virtue of the GO node being an ancestor of an explicitly annotated +# one. The returned array contains no duplicates. + +# Because the Gene Ontology no longer has the unknown terms, then +# direct annotation to the aspect node (e.g. molecular function), +# means what annotation to the unknown terms previously meant. But, +# as all nodes are descendents of the aspect node, then enrichment for +# this node will never happen, unless we only look for enrichment of +# direct annotations to this node. Thus, in this method, we also +# record which databaseIds are directly annotated to the aspect node, which +# will be used elsewhere. + + my ($self, $databaseId) = @_; + + # cache aspect's ID, so we don't have to repeatedly retrieve it + + my $aspectId = ($self->__ontologyProvider->rootNode->childNodes())[0]->goid; # + + # generate list of GOIDs if not cached + + if (!exists($self->{$kGOIDsForDatabaseIds}->{$databaseId})) { + + my %goids; # so we keep the list unique + + # go through the direct annotations + + foreach my $goid (@{$self->__annotationProvider->goIdsByDatabaseId(databaseId => $databaseId, + aspect => $self->aspect)}){ + + # just in case an annotation is to a goid not present in the ontology + my $node = $self->__ontologyProvider->nodeFromId($goid); + + if (!$node) { + + if ($WARNINGS){ + + print STDERR "\nWarning : $goid, used to annotate $databaseId with an aspect of ".$self->aspect.", does not appear in the provided ontology.\n"; + + } + + # don't record annotations to this goid + + next; + + } + + # in case this was a seconday id, switch to primary id -- mark + $goid = $node->goid; + + # record the goid and its ancestors + + $goids{$goid} = undef; + + foreach my $ancestor ($self->__ontologyProvider->nodeFromId($goid)->ancestors){ + + $goids{$ancestor->goid} = undef; + + } + + # record in the self object if it's directly annotated to the aspectId + + if ($goid eq $aspectId){ + + $self->{$kDirectAnnotationToAspect}{$databaseId} = undef; + + } + + } + + # cache the value + + $self->{$kGOIDsForDatabaseIds}->{$databaseId} = [keys %goids]; + } + + return ($self->{$kGOIDsForDatabaseIds}->{$databaseId}); + +} + +##################################################################### +sub __calculatePValues{ +##################################################################### +# This method actually determines the p-values of the various levels +# of annotation for the particular GO nodes, and stores them within +# the object. + + my $self = shift; + + my $numDatabaseIds = scalar $self->genesDatabaseIds; + + my @pvalueArray; + + # cache so we don't have to repeatedly look it up + + my $rootGoid = $self->__ontologyProvider->rootNode->goid; + + # each node we consider here must have at least one annotation in + # our list of provided genes. + + foreach my $goid ($self->__allGoIdsForList) { + + # skip the root node, as it has to have a probability of 1, + # but don't skip its immediate child though, as we now test + # for enriched direct annotations + + next if ($goid eq $rootGoid); + + # skip any that has only one (or zero - could happen for the + # aspect goid, as we replaced its counts) annotation in the + # background distribution, as by definition these cannot be + # overrepresented + + next if ($self->__numAnnotationsToGoId($goid) <= 1); + + # if we get here, we should calculate a p-value for this node + + my $pv = $self->__processOneGOID($goid, $numDatabaseIds); + push (@pvalueArray, $pv) if ($pv); + + } + + # now sort the pvalueArray by their pValues. If the values are the same, + # then sort by goid (text based comparison). + + @pvalueArray = sort {$a->{PVALUE} <=> $b->{PVALUE} || + $a->{NODE}->goid cmp $b->{NODE}->goid } @pvalueArray; + + $self->{$kPvalues} = \@pvalueArray; + +} + +############################################################################ +sub __processOneGOID{ +############################################################################ +# This processes one GOID. It determines the number of annotations to +# the current GOID, and the P-value of that number of annotations. +# The pvalue is calculated as the probability of observing x or more +# positives in a sample on n, given that there are M positives in a +# population of N. This is calculated using the hypergeometric +# distribution. +# +# It returns a hash reference encoding that information. + + my ($self, $goid, $n) = @_; + + my $M = $self->__totalNumAnnotationsToGoId($goid); +# return undef if (!$M); # sometimes happens with "unannotated" node - mark + my $x = $self->__numAnnotationsToGoId($goid); + my $N = $self->totalNumGenes(); + + # If not using a background population, query identifiers that are not found in the annotation + # will appear in x but not M. We need to add them to M, otherwise M could be < x. With a + # background population, those query identifiers are removed before the analysis. + if ($goid eq $kUnannotatedNode->goid) { + $M += $x; + } + + my $pvalue = $self->{$kDistributions}->pValueByHypergeometric($x, $n, $M, $N); + +# my $node = $self->__ontologyProvider->nodeFromId($goid) || $kUnannotatedNode; + my $node; + $node = $self->__ontologyProvider->nodeFromId($goid); + if (!$node) { + $node = $kUnannotatedNode; + } + + my $hashRef = { + + NODE => $node, + PVALUE => $pvalue, + NUM_ANNOTATIONS => $x, + TOTAL_NUM_ANNOTATIONS => $M, + CORRECTED_PVALUE => $pvalue, # this will get changed later if a correction method is used --mark + + }; + + return $hashRef; + +} + +############################################################################ +sub __numAnnotationsToGoId{ +############################################################################ +# This private method returns the number of annotations to a +# particular GOID for the list of genes supplied to the findTerms +# method. + + my ($self, $goid) = @_; + + return $self->{$kGoCounts}->{$goid}; + +} + +############################################################################ +sub __totalNumAnnotationsToGoId{ +############################################################################ +# This returns the total number of genes that have been annotated to a +# particular GOID based on all annotations. + + my ($self, $goid) = @_; + + return $self->{$kTotalGoNodeCounts}->{$goid}; +} + +############################################################################ +sub totalNumGenes{ +############################################################################ +=pod + +=head2 totalNumGenes + +This returns the total number of genes that are in the background set +of genes from which the genes of interest were drawn. Unannotated +genes are included in this count. + +=cut + + return $_[0]->{$kArgs}{totalNumGenes}; + +} + +############################################################################ +sub __allGoIdsForList{ +############################################################################ +# This returns an array of GOIDs to which genes in the passed in gene +# list were directly or indirectly annotated. + + return keys %{$_[0]->{$kGoCounts}}; + +} + +############################################################################ +sub __correctPvalues{ +############################################################################ +# This method corrects the pvalues for multiple hypothesis testing, by +# dispatching to the appropriate method based on what method was +# requested for hypothesis correction. + + my $self = shift; + + my $correctionMethod = "__correctPvaluesBy".$self->__correctionMethod; + + $self->$correctionMethod; + +} + +##################################################################### +sub __correctPvaluesBybonferroni{ +##################################################################### +# This method corrects the p-values using a Bonferroni correction, +# where the correction factor is the total number of nodes for which +# we tested whether there was significant enrichment + + my $self = shift; + + # now correct the pvalues with the correction factor + + my $correctionFactor = scalar(@{$self->{$kPvalues}}); + + # no correction needs to be done if there is 0 or 1 hypotheses + # that were tested + + # print "Applying Bonferroni correction factor of ".$correctionFactor."\n"; + + if ($correctionFactor > 1){ + + # simply go through each hypothesis and calculate the corrected + # p-value by multiplying the uncorrected p-value by the number of + # nodes in the ontology + + foreach my $hypothesis ($self->__pValues){ + + $hypothesis->{CORRECTED_PVALUE} = $hypothesis->{PVALUE} * $correctionFactor; + + # make sure we have a ceiling of 1 + + $hypothesis->{CORRECTED_PVALUE} = 1 if ($hypothesis->{CORRECTED_PVALUE} > 1); + + } + + } + +} + +############################################################################ +sub __correctPvaluesBysimulation{ +############################################################################ +# This method corrects the P-values based on a thousand random trials, +# using the same number of genes for each trial as was used in the +# client query. A p-value will be corrected based on the number of +# simulations in which that p-value was seen, e.g. if an uncorrected +# p-value of 0.05 or better was observed in 100 of 1000 trials, the +# corrected value will be 0.1 (100/1000). + + my $self = shift; + + # when we run any simulation, any of the variables that get + # modified during the findTerms method will be trampled on - thus + # we have to save them away, and then restore them afterwards + + my $variables = $self->__saveVariables(); + + # we will need access to the real hypotheses - we'll reverse them + # for now, as it makes them easier when we use them later on + + my @realHypotheses = reverse @{$self->{$kPvalues}}; + + # now let's get the population from which we will sample genes + # randomly + + my @names = $self->__samplingPopulation; + + my $populationSize = scalar @names; + + # now get the number of genes in the original test set + # for which terms were found. + + my $numGenes = scalar $self->genesDatabaseIds; + + # now we can finally run the simulations + + my $numSimulations = 1000; + + for (my $i = 1; $i <= $numSimulations; $i++) { + + # run simulation + + my @pvals = $self->__runOneSimulation(\@names, $numGenes, $populationSize); + + # go onto a new simulation if no hypothese resulted (which is + # possible if the randomly selected genes did not have more + # than one annotation to any particular GO node) + + next if !@pvals; + + # now we look at the best pvalue for the random genes, and + # determine whether it is more significant that any of the + # p-values generated for the real genes. We will keep a count + # of how many times we see a p-value that is better than one + # calculated with the real genes, on a per simulation basis + + # if we go through the p-values for the real nodes in reverse + # order (we reversed them above), then we can quit out of the + # loop as soon as we have a p-value better than the best one + # generated from the random genes + + foreach my $realHypothesis (@realHypotheses){ + + # skip examining, if the real pvalue is better than the + # best one for the random genes + + last if ($pvals[0]->{PVALUE} > $realHypothesis->{PVALUE}); + + # if we get here, we know that this simulation has generated + # a P_VALUE that is better than the P_VALUE for the currently + # considered hypothesis. We'll simply keep count for now + + $realHypothesis->{NUM_OBSERVATIONS}++; + + } + + } + + # now we've run all the simulations, we should be able to simply divide + # the observed frequency by the number of simulations. + + foreach my $realHypothesis (@realHypotheses){ + + if (exists $realHypothesis->{NUM_OBSERVATIONS}){ + + $realHypothesis->{CORRECTED_PVALUE} = $realHypothesis->{NUM_OBSERVATIONS}/$numSimulations; + + }else{ + + # a pvalue better than this wasn't observed in any + # simulation - just record the minimum + + $realHypothesis->{CORRECTED_PVALUE} = 1/$numSimulations; + + # and say that we never saw it + + $realHypothesis->{NUM_OBSERVATIONS} = 0; + + } + + } + + @realHypotheses = reverse @realHypotheses; + + # now restore the variables + + $self->__restoreVariables($variables); + + # finally replace the hypotheses with our local copy, which we've + # made some modifications to + + $self->{$kPvalues} = \@realHypotheses; + +} + +############################################################################ +sub __saveVariables{ +############################################################################ +# This private method returns a hash containing various of the +# instance variables that might get trampled on during a simulation + + my ($self) = @_; + + my %variables; + + my @keys = ($kCorrectionMethod, $kShouldCalculateFDR, $kDatabaseIds, + $kDatabaseId2OrigName, $kGoCounts, $kPvalues, $kDiscardedGenes); + + foreach my $key (@keys){ + + $variables{$key} = $self->{$key}; + + } + + return \%variables; + +} + +############################################################################ +sub __restoreVariables{ +############################################################################ +# This private method uses a passed in hash (by reference) to restore +# variables within the instance + + my ($self, $hashRef) = @_; + + foreach my $key (%{$hashRef}){ + + $self->{$key} = $hashRef->{$key}; + + } + +} + +############################################################################ +sub __samplingPopulation{ +############################################################################ +# This private method returns an array of id's that should be used as +# the sampling population for the simulation + + my $self = shift; + + # we will need to pick genes randomly from the background + # population. Note that population may be larger than the + # databaseIds that are referenced in the annotations file - if so, + # we have to be able to randomly select unannotated genes too + + # alternatively, the user may have specified a population of genes + # that define the background - in which case we should pick only + # from that population + + my @names; + + if ($self->__isUsingPopulation){ + + @names = @{$self->__population}; + + }else{ + + # we simply use all databaseIds from the annotationProvider + + @names = $self->__annotationProvider->allDatabaseIds(); + + } + + # note the population size + + my $populationSize; + + if (! defined $self->totalNumGenes){ + + $populationSize = scalar @names; + + }else{ + + $populationSize = $self->totalNumGenes; + + } + + # now, if the population from which we should sample is bigger + # that the number of databaseIds which we have to sample from, we + # want to expand the the list of databaseIds with some fake ones, + # that correspond to unnannotated genes. + + my $numDatabaseIds = scalar @names; + + for (my $n = $numDatabaseIds; $n < $populationSize; $n++){ + + push (@names, $kFakeIdPrefix.$n); + + } + + return @names; + +} + +############################################################################ +sub __runOneSimulation{ +############################################################################ +# This method runs a single simulation of GO::TermFinder, and returns the +# generated hypotheses. It requires a reference to a list of genes that +# should be used to sample from, the number of genes that should be chosen, +# and the size of the background distribution + + my ($self, $namesRef, $numGenes, $populationSize) = @_; + + # first get a random list of genes + + my $listRef = $self->__listOfRandomGenes($namesRef, $numGenes, $populationSize); + + # now we have a list of genes, we can findTerms for them + + # however, we have to make sure that for these guys, we attempt + # no p-value correction, otherwise we will infinitely recurse, + # and make sure that we don't ask to calculate the FDR + + my @pvals = $self->findTerms(genes => $listRef, + correction => 'none', + calculateFDR => 0); + + # now return the hypotheses + + return (@pvals); + +} + +############################################################################ +sub __listOfRandomGenes{ +############################################################################ +# This private method returns a reference to an array of randomly +# chosen genes from a population that was passed in by reference + + my ($self, $namesRef, $numGenes, $populationSize) = @_; + + # create an array with as many indices as there are genes in the + # background set of genes from which those of interest were drawn + + my @indices; + + for (my $i = 0; $i < $populationSize; $i++){ + + $indices[$i] = $i; + + } + + # now sample those indices, removing sampled elements as we go. + # Use the randomly chosen index to get a random gene, and select + # as many random genes as were in the test set + + my @list; + + for (my $i = 0; $i < $numGenes; $i++) { + + my $index = int(rand(scalar(@indices))); # random number between 0 and last array index. + + my $selectedIndex = splice(@indices, $index, 1); # Remove the randomly selected element from the array. + + push(@list, $namesRef->[$selectedIndex]); + + } + + return \@list; + +} + +############################################################################ +sub __calculateFDR{ +############################################################################ +# This method calculates the false discovery rate for each hypothesis, +# such that you know if you draw your cut-off at a particular node, +# what the false discovery rate is. It does 50 simulations with +# random genes, and calculates on average the percentage of nodes that +# exceed a given value in the simulation, compared to the number that +# exceed that p-value in the real data. + + my $self = shift; + + # when we run any simulation, any of the variables that get + # modified during the findTerms method will be trampled on - thus + # we have to save them away, and then restore them afterwards + + my $variables = $self->__saveVariables(); + + # we will need access to the real hypotheses + + my @realHypotheses = @{$self->{$kPvalues}}; + + # now let's get the population from which we will sample genes + # randomly + + my @names = $self->__samplingPopulation; + + my $populationSize = scalar @names; + + # now get the number of genes in the original test set + # for which terms were found. + + my $numGenes = scalar $self->genesDatabaseIds; + + # now we can finally run the simulations + + my $numSimulations = 50; + + for (my $i = 1; $i <= $numSimulations; $i++) { + + # now run a simulation + + my @pvals = $self->__runOneSimulation(\@names, $numGenes, $populationSize); + + # go onto a new simulation if no hypotheses resulted (which is + # theoretically possible if the randomly selected genes did + # not have more than one annotation to any particular GO node) + + next if !@pvals; + + # now we look at the best pvalue for the random genes, and + # determine whether it is more significant that any of the + # p-values generated for the real genes. We will keep a count + # of how many times we see a p-value that is better than one + # calculated with the real genes, on a per simulation basis + + # if we go through the p-values for the real nodes in reverse + # order (we reversed them above), then we can quit out of the + # loop as soon as we have a p-value better than the best one + # generated from the random genes + + foreach my $realHypothesis (@realHypotheses){ + + # count the number of nodes that this simulation has + # generated a P_VALUE that is better than the P_VALUE for + # the currently considered hypothesis. + + foreach my $pval (@pvals){ + + # finish considering this real hypothesis as soon as + # we see a pvalue that is worse from the simulated + # data + + last if ($pval->{PVALUE} > $realHypothesis->{PVALUE}); + + # if we get here, our simulated pvalue must exceed the + # pvalue associated with the real hypothesis + + $realHypothesis->{FDR_OBSERVATIONS}++; + + } + + } + + } + + # now we've run all the simulations, and counted for each real + # hypothesis how many hypotheses from the simulations were better, + # we calculate on average how many were better per simulation, + # then divide by the number of hypotheses as good or better in our + # real data. We threshold this at a maximum of 1, as we can't + # have a FDR of greater than 100% + + foreach (my $i = 0; $i < @realHypotheses; $i++){ + + if (exists $realHypotheses[$i]->{FDR_OBSERVATIONS}){ + + # the rate is the average number in the simulations that + # are better than this pvalue, divided by the number that + # are better in the real data + + $realHypotheses[$i]->{FDR_OBSERVATIONS} /= $numSimulations; + + $realHypotheses[$i]->{FDR_RATE} = $realHypotheses[$i]->{FDR_OBSERVATIONS} / ($i + 1); + + if ($realHypotheses[$i]->{FDR_RATE} > 1){ + + $realHypotheses[$i]->{FDR_RATE} = 1; + + } + + }else{ + + # a pvalue better than this wasn't observed in any + # simulation - so the FDR should be 0 + + $realHypotheses[$i]->{FDR_RATE} = 0; + + # and say that we never saw it + + $realHypotheses[$i]->{FDR_OBSERVATIONS} = 0; + + } + + # now based on the FDR, and the number of hypotheses that would + # be chosen at this point, we can calculate the expected number of + # false positives, as the FDR x the number of hypotheses + + $realHypotheses[$i]->{EXPECTED_FALSE_POSITIVES} = $realHypotheses[$i]->{FDR_RATE} * ($i+1); + + } + + # now restore the variables + + $self->__restoreVariables($variables); + + # finally we want to replace our real hypotheses with our local + # copy, as we've made some changes + + $self->{$kPvalues} = \@realHypotheses; + +} + +############################################################################ +sub __addAnnotationsToPValues{ +############################################################################ +# This method looks through the annotated nodes, and adds in information +# about which genes are annotated to them, so that the client can retrieve +# that information. + + my $self = shift; + + # to do this, we can take advantage of the fact that all the + # nodes should have all their databaseIds cached, and we can + # retrieve them through the __allGOIDsForDatabaseId() method + + # first go through the annotated nodes, and simply hash the goid to the + # entry in the pValues array + + my %nodeToIndex; + + for (my $i = 0; $i < @{$self->{$kPvalues}}; $i++){ + + $nodeToIndex{$self->{$kPvalues}->[$i]->{NODE}->goid} = $i; + + } + + # now go through each databaseId, and add the information in + + foreach my $databaseId ($self->genesDatabaseIds) { + + # look at all goids for this database id + + foreach my $goid (@{$self->__allGOIDsForDatabaseId($databaseId)}){ + + next if (! exists $nodeToIndex{$goid}); # this node wasn't a hypothesis + + # if this goid was a hypothesis, we can annotate the + # corresponding hypothesis with the gene + + $self->{$kPvalues}->[$nodeToIndex{$goid}]->{ANNOTATED_GENES}->{$databaseId} = $self->__origNameForDatabaseId($databaseId); + + } + + } + +} + +############################################################################ +sub __annotationProvider{ +############################################################################ +# This private method returns the annotationProvider that was used +# during construction. + + return $_[0]->{$kArgs}{annotationProvider}; + +} + +############################################################################ +sub __ontologyProvider{ +############################################################################ +# This private methid returns the ontologyProvider that was used +# during construction. + + return $_[0]->{$kArgs}{ontologyProvider}; + +} + +############################################################################ +sub aspect{ +############################################################################ +=pod + +=head2 aspect + +Returns the aspect with the the GO::TermFinder object was constructed. + +Usage: + + my $aspect = $termFinder->aspect; + +=cut + + return $_[0]->{$kArgs}{aspect}; + +} + +1; # to make perl happy + + +__END__ + +##################################################################### +# +# Additional POD Documentation from here on down +# +##################################################################### + +=pod + +=head1 Authors + + Gavin Sherlock; sherlock@genome.stanford.edu + Elizabeth Boyle; ell@mit.edu + Ihab Awad; ihab@genome.stanford.edu + +=cut diff --git a/www/lib/GO/TermFinderReport/Html.pm b/www/lib/GO/TermFinderReport/Html.pm new file mode 100644 index 0000000..3d8c595 --- /dev/null +++ b/www/lib/GO/TermFinderReport/Html.pm @@ -0,0 +1,419 @@ +package GO::TermFinderReport::Html; + +=pod + +=head1 NAME + +GO::TermFinderReport::Html - prints an html table of the results of GO::TermFinder + +=head1 DESCRIPTION + +This print() method of this Perl module receives a reference to an the +array that is the return value from the findTerms method of +GO::TermFinder, the aspect for which terms were found, the number of +genes that were used to generate the terms, and the number of genes +that were said to be in the genome. It will then generate an html +table that summarizes those results. Optionally, filehandle, p-value +cutoff, gene URL, and GOID URL arguments may also be passed in. Url +links should have the string to indicate where the gene +name, or GOID should be put. + +=head1 SYNOPSIS + + use GO::TermFinder; + use GO::TermFinderReport::Html; + + . + . + . + + my @pvalues = $termFinder->findTerms(genes=>\@genes); + + my $report = GO::TermFinderReport::Html->new(); + + open (HTML, ">blah.html"); + + print HTML ""; + + my $numRows = $report->print(pvalues => \@pvalues, + aspect => $aspect, + numGenes => scalar(@genes), + totalNum => $totalNum, + fh => \*HTML, + cutoff => 0.01, + geneUrl => 'https://www.yeastgenome.org/locus/', + goidUrl => 'https://www.yeastgenome.org/go/GO:0000916='); + + print HTML ""; + + close HTML; + +=cut + +use strict; +use warnings; +use diagnostics; + +use vars qw ($VERSION); + +$VERSION = 0.12; + +use CGI qw/:all :html3/; + +###################################################################################### +sub new{ +###################################################################################### + +=head2 new + +This is the constructor. + +Usage: + + my $report = GO::TermFinderReport::Html->new(); + +A GO::TermFinderReport::Html object is returned. + +=cut + +###################################################################################### + + my $self = {}; + + bless $self, shift; + + return $self; + +} + +###################################################################################### +sub print{ +###################################################################################### + +=head2 print + +This method prints out the report, in the form of an html table. The +table is ordered in ascending order of p-value (i.e. most significant +first), and will print out the GO node, the frequency of use of that +node within the selected group of genes, and the population as a +whole, the corrected p-value of that, and a list of the genes +annotated to that node. If the FDR was calculated, the FDR will also +be printed. It returns the number of annotation rows in the table +that exceed the provided p-value cutoff (which may even be zero, in +which case nothing is printed). + +Usage: + + my $numRows = $report->print(pvalues => \@pvalues, + aspect => $aspect, # P, C, or F + numGenes => scalar(@genes), + totalNum => $totalNum, + fh => \*HTML, + pvalueCutOff => 0.01, + geneUrl => 'https://www.yeastgenome.org/locus/', + goidUrl => 'https://www.yeastgenome.org/go/GO:0000916'); + +Required arguments: + + pvalues : A reference to the array returned by the findTerms() method of GO::TermFinder + + aspect : The aspect of the Gene Ontology for which terms were found (C, F or P) + + numGenes : The number of genes that were in the list passed to the findTerms method + + totalNum : The total number of genes that were indicated to be in the genome for finding terms. + +Optional arguments: + + + fh : A reference to a file handle to which the table should be + printed. Defaults to standard out. + + pvalueCutOff : The p-value cutoff, above which p-values and associated + information will not be printed. Default is no cutoff. + + geneUrl : A url to which you want genes linked. Must contain the + text '', which will be replaced with the + gene name. + + goidUrl : A url to which you want the GOIDs linked. Must contain the + text '', which will be replaced with the + goid. + +=cut + +###################################################################################### + + my ($self, %args) = @_; + + if (!exists($args{'pvalues'})){ + + die "You must supply a pvalues argument to the print method."; + + } + + if (!exists($args{'aspect'})){ + + die "You must supply a aspect argument to the print method."; + + } + + + if (!exists $args{'numGenes'}){ + + die "You must supply a numGene argument to the print method."; + + } + + if (!exists $args{'totalNum'}){ + + die "You must supply a totalNum argument to the print method."; + + } + + my $pvalues = $args{'pvalues'}; + my $aspect = $args{'aspect'}; + my $numGenes = $args{'numGenes'}; + my $totalNum = $args{'totalNum'}; + my $fh = $args{'fh'} || \*STDOUT; + my $cutoff = $args{'pvalueCutOff'} || 1; + my $geneUrl = $args{'geneUrl'}; + my $goidUrl = $args{'goidUrl'}; + my $annoFile = $args{'annoFile'}; + my $correction = $args{'correction'}; # -- mark + $correction = "bonferroni" if (!$correction); + my $replacementText = ""; + my $replacementText2 = ""; # added because some sites may not allow gene name - mark + + my $rows; + my $numRows = 0; + + my $hasFdr = 0; + + foreach my $pvalue (@{$pvalues}){ + + # skip if above cutoff + + # this is the same as the uncorrected PVALUE if no correction method used -- mark + next if ($pvalue->{CORRECTED_PVALUE} > $cutoff); + + # now generate a list of loci annotated to this node, with + # links if requested + + my @loci; + + foreach my $databaseId (keys %{$pvalue->{ANNOTATED_GENES}}){ + + my $gene = $pvalue->{ANNOTATED_GENES}->{$databaseId}; + if (defined $geneUrl) { + + my $url = $geneUrl; + + $url =~ s/$replacementText/$gene/; + $url =~ s/$replacementText2/$databaseId/; + $gene = a({-href=>$url, + -target=>'infowin'}, $gene); + + } + + push (@loci, $gene); + + } + + my $loci = join(", ", @loci); + + # now calculate the frequency for annotation for the list and the genome + + my $frequencyPercent = sprintf("%.1f", ($pvalue->{NUM_ANNOTATIONS}/$numGenes) * 100); + +# my $frequency = $pvalue->{NUM_ANNOTATIONS}." out of $numGenes genes, $frequencyPercent\%"; + my $frequency = $pvalue->{NUM_ANNOTATIONS}." of $numGenes genes, $frequencyPercent\%"; + + my $geneFrequencyPercent = sprintf("%.1f", ($pvalue->{TOTAL_NUM_ANNOTATIONS}/$totalNum) * 100); + +# my $genomeFrequency = $pvalue->{TOTAL_NUM_ANNOTATIONS}." out of $totalNum genes, $geneFrequencyPercent\%"; + my $genomeFrequency = $pvalue->{TOTAL_NUM_ANNOTATIONS}." of $totalNum genes, $geneFrequencyPercent\%"; + + # now format the p-value + + my $value = $pvalue->{CORRECTED_PVALUE}; + + # if it's in scientific notation, we want up to two of the decimal places + + $value =~ s/^(.*\.[0-9]{2}).*(e.+)$/$1$2/; + + # otherwise, we'll take up to five decimal places + + $value =~ s/^(0\.[0-9]{5})[0-9]*$/$1/; + + if (defined ($pvalue->{NUM_OBSERVATIONS}) && $pvalue->{NUM_OBSERVATIONS} == 0){ + + # simulations were used to generate the corrected p-value. + # If we never saw anything better than this p-value in the + # simulations, then prepend a less than sign to the + # corrected p-value + + $value = "<".$value; + + } + + # now deal with the GOID column + + my $goColumn; + + if (defined $goidUrl){ + + my $url = $goidUrl; + my $goid = $pvalue->{NODE}->goid; + + $url =~ s/$replacementText/$goid/; + + # make link with name of term as the text + + $goColumn = a({-href=>$url, + -target=>'infowin'}, $pvalue->{NODE}->term); + + }else{ + + # if no link, just use term, and parenthetical GOID + + $goColumn = $pvalue->{NODE}->term." (".$pvalue->{NODE}->goid.")"; + + } + + # deal with FDR + + my ($fdr, $falsePositives); + + if (exists ($pvalue->{FDR_RATE})){ + + $hasFdr = 1; + + $fdr = sprintf ("%.2f%%", $pvalue->{FDR_RATE} * 100); + + $falsePositives = sprintf ("%.2f", $pvalue->{EXPECTED_FALSE_POSITIVES}); + + } + + $rows .= $self->_oneRow($goColumn, $frequency, + $genomeFrequency, $value, $loci, $fdr, + $falsePositives); + + $numRows++; + + } + + # print the table out, if there were any rows + + $self->_printTable($fh, $rows, $aspect, $annoFile, $cutoff, $hasFdr, $args{evidenceCodes}, $args{evidenceCodesNot}, $correction) if ($numRows > 0); + + return $numRows; + +} + +################################################################### +sub _oneRow{ +################################################################### +# This protected method simply returns a row from the html table, +# based on what was passed in. + + my ($self, $goColumn, $frequency, $genomeFrequency, $pvalue, + $loci, $fdr, $falsePositives) = @_; + + my $row = td($goColumn). + td($frequency). + td($genomeFrequency). + td($pvalue); + + if (defined($fdr)){ + + $row .= td($fdr).td($falsePositives); + + } + td($loci); + + $row .= td($loci); + + return Tr($row); + +} + +################################################################### +sub _printTable{ +################################################################### +# This method prints out the actual html table + + my ($self, $fh, $rows, $aspect, $annoFile, $cutoff, $hasFdr, $evidenceCodes, $evidenceCodesNot, $correction) = @_; + + $aspect =~ s/^F/Function/i; + $aspect =~ s/^P/Process/i; + $aspect =~ s/^C/Component/i; + + print $fh a({-name=>'table'}); + print $fh center(h3("Result Table")), p; + + print $fh ""; +# print $fh table({-align => 'center', +# -border => 1, +# -cellpadding => 2, +# -width => 400}, + print $fh Tr(td({-bgcolor => '#FFCC99', + -align => 'center', + -width => '100%', + -nowrap => undef}, +# b("Terms from the $aspect Ontology with p-value as good or better than $cutoff")))); + b("Terms from the $aspect Ontology ",$annoFile? "of $annoFile " : "", "with p-value <= $cutoff"))); + + if ($evidenceCodes) { + print $fh Tr(td({-bgcolor => '#CCBBAA', + -align => 'center', + -width => '100%', + -nowrap => undef}, + b("Evidence code filter included: ".join(" ", sort { $a cmp $b } keys(%{$evidenceCodes}))))); + } elsif ($evidenceCodesNot) { + print $fh Tr(td({-bgcolor => '#CCBBAA', + -align => 'center', + -width => '100%', + -nowrap => undef}, + b("Evidence codes filtered out: ".join(" ", sort { $a cmp $b } keys(%{$evidenceCodesNot}))))); + } + + my $headings = th({-align => 'center'}, "Gene Ontology term"). + th({-align => 'center'}, "Cluster frequency"). +# th({-align => 'center'}, "Genome frequency of use"). + th({-align => 'center'}, "Genome frequency"); + if ($correction ne "none") { # change column header depending on whether correction is enabled -- mark + $headings .= th({-align => 'center'}, "Corrected
P-value"); + } else { + $headings .= th({-align => 'center'}, "Uncorrected
P-value"); + } + + + if ($hasFdr){ + + $headings .= th({-align => 'center'}, "FDR"). + th({-align => 'center'}, "False
Positives"); + + } + + $headings .= th({-align => 'center'}, "Genes annotated to the term"); + + print $fh table({-align => 'center', + -border => 2}, + Tr({-bgcolor => '#CCCCFF'}, + $headings). + $rows), p; + +} + +1; # to keep Perl happy + +=pod + +=head1 AUTHOR + +Gavin Sherlock + +sherlock@genome.stanford.edu + +=cut diff --git a/www/lib/GO/TermFinderReport/Text.pm b/www/lib/GO/TermFinderReport/Text.pm new file mode 100644 index 0000000..a3ec0f4 --- /dev/null +++ b/www/lib/GO/TermFinderReport/Text.pm @@ -0,0 +1,283 @@ +package GO::TermFinderReport::Text; + +=pod + +=head1 NAME + +GO::TermFinderReport::Text - prints results of GO::TermFinder as a text report + +=head1 DESCRIPTION + +This print() method of this Perl module receives a reference to an the +array that is the return value from the findTerms method of +GO::TermFinder, the number of genes that were used to generate the +terms, and the number of genes that were said to be in the genome. It +will then generate a text report that summarizes those results. +Optionally, filehandle and p-value cutoff arguments may also be passed +in. It will return the + +=head1 SYNOPSIS + + use GO::TermFinder; + use GO::TermFinderReport::Text; + + . + . + . + + my @pvalues = $termFinder->findTerms(genes=>\@genes); + + my $report = GO::TermFinderReport::Text->new(); + + open (OUT, ">report.text"); + + my $numHypotheses = $report->print(pvalues => \@pvalues, + aspect => $aspect, + numGenes => scalar(@genes), + totalNum => $totalNum, + cutoff => 0.01, + fh => \*OUT); + + close OUT; + +=cut + +use strict; +use warnings; +use diagnostics; + +use vars qw ($VERSION); + +$VERSION = 0.10; + +###################################################################################### +sub new{ +###################################################################################### + +=head2 new + +This is the constructor. + +Usage: + + my $report = GO::TermFinderReport::Text->new(); + +A GO::TermFinderReport::Text object is returned. + +=cut + +###################################################################################### + + my $self = {}; + + bless $self, shift; + + return $self; + +} + +###################################################################################### +sub print{ +###################################################################################### + +=head2 print + +This method prints out the text report of the passed in hypotheses. +The report is ordered in ascending order of p-value (i.e. most +significant first). If the FDR was calculated, the FDR will also be +printed. It returns the number of hypotheses that had corrected +p-values as good or better than the passed in cutoff. + +Usage: + + my $numHypotheses = $report->print(pvalues => \@pvalues, + numGenes => scalar(@genes), + totalNum => $totalNum, + cutoff => 0.01, + fh => \*OUT, + table => 0 ); + +Required arguments: + +pvalues : A reference to the array returned by the findTerms() method + of GO::TermFinder + +numGenes : The number of genes that were in the list passed to the + findTerms method + +totalNum : The total number of genes that were indicated to be in the + genome for finding terms. + +Optional arguments: + +fh : A reference to a file handle to which the table should be + printed. Defaults to standard out. + +cutoff : The p-value cutoff, above which p-values and associated + information will not be printed. Default is no cutoff. + +table : 0 for standard output, 1 for tab delimited table. Default is 0 + +=cut + +###################################################################################### + + my ($self, %args) = @_; + + if (!exists($args{'pvalues'})){ + + die "You must supply a pvalues argument to the print method."; + + } + + if (!exists $args{'numGenes'}){ + + die "You must supply a numGene argument to the print method."; + + } + + if (!exists $args{'totalNum'}){ + + die "You must supply a totalNum argument to the print method."; + + } + + my $pvalues = $args{'pvalues'}; + my $numGenes = $args{'numGenes'}; + my $totalNum = $args{'totalNum'}; + my $fh = $args{'fh'} || \*STDOUT; + my $cutoff = $args{'cutoff'} || 1; + my $table = $args{'table'} || 0; + + my $correction = $args{'correction'}; # -- mark + $correction = "bonferroni" if (!$correction); + + my $rows; + my $numRows = 0; + + my $hasFdr = 0; + + my $hypothesis = 1; + + my @header = ("GOID", "TERM", "CORRECTED_PVALUE", + "UNCORRECTED_PVALUE", "NUM_LIST_ANNOTATIONS", + "LIST_SIZE", "TOTAL_NUM_ANNOTATIONS", + "POPULATION_SIZE", "FDR_RATE", + "EXPECTED_FALSE_POSITIVES", "ANNOTATED_GENES"); + + print $fh join("\t", @header), "\n" if ($table); + + foreach my $pvalue (@{$pvalues}){ + + # skip if above cutoff + + next if ($pvalue->{CORRECTED_PVALUE} > $cutoff); + + # now format the p-value + + my $value = $pvalue->{CORRECTED_PVALUE}; + + # if it's in scientific notation, we want up to two of the decimal places + + $value =~ s/^(.*\.[0-9]{2}).*(e.+)$/$1$2/; + + # otherwise, we'll take up to five decimal places + + $value =~ s/^(0\.[0-9]{5})[0-9]*$/$1/; + + if (defined ($pvalue->{NUM_OBSERVATIONS}) && $pvalue->{NUM_OBSERVATIONS} == 0){ + + # simulations were used to generate the corrected p-value. + # If we never saw anything better than this p-value in the + # simulations, then prepend a less than sign to the + # corrected p-value + + $value = "<".$value; + + } + + if (!$table){ + + print $fh + + "-- $hypothesis of ", scalar @{$pvalues}, " --\n", + "GOID\t", $pvalue->{NODE}->goid, "\n", + "TERM\t", $pvalue->{NODE}->term, "\n", + ($correction ne "none")? "CORRECTED P-VALUE\t".$pvalue->{CORRECTED_PVALUE}."\n" : "", # -- mark + "UNCORRECTED P-VALUE\t", $pvalue->{PVALUE}, "\n"; + + }else{ + + print $fh join("\t", ($pvalue->{NODE}->goid, + $pvalue->{NODE}->term, + ($correction ne "none")? $pvalue->{CORRECTED_PVALUE} : "-", + $pvalue->{PVALUE}, + $pvalue->{NUM_ANNOTATIONS}, + $numGenes, + $pvalue->{TOTAL_NUM_ANNOTATIONS}, + $totalNum)), "\t"; + + } + + # deal with FDR + + my ($fdr, $falsePositives); + + if (exists ($pvalue->{FDR_RATE})){ + + $fdr = sprintf ("%.2f%%", $pvalue->{FDR_RATE} * 100); + + $falsePositives = sprintf ("%.2f", $pvalue->{EXPECTED_FALSE_POSITIVES}); + + if(!$table){ + + print $fh + + "FDR_RATE\t", $fdr, "\n", + "EXPECTED_FALSE_POSITIVES\t", $falsePositives, "\n"; + + }else{ + + print $fh $fdr, "\t", $falsePositives, "\t"; + + } + + }else{ + + print $fh "\t\t" if ($table); # Gotta fill in the blanks + + } + + if (!$table){ + + print $fh "NUM_ANNOTATIONS\t"; + print $fh $pvalue->{NUM_ANNOTATIONS}; + print $fh " of $numGenes in the list, vs "; + print $fh $pvalue->{TOTAL_NUM_ANNOTATIONS}; + print $fh " of $totalNum in the genome\n"; + print $fh "The genes annotated to this node are:\n";; + + } + + print $fh join(", ", values(%{$pvalue->{ANNOTATED_GENES}})), "\n"; + print $fh "\n" if (!$table); + + $hypothesis++; + + } + + return ($hypothesis - 1); + +} + +1; # to keep Perl happy + +=pod + +=head1 AUTHOR + +Gavin Sherlock + +sherlock@genome.stanford.edu + +=cut diff --git a/www/lib/GO/Utils/General.pm b/www/lib/GO/Utils/General.pm new file mode 100644 index 0000000..b347046 --- /dev/null +++ b/www/lib/GO/Utils/General.pm @@ -0,0 +1,112 @@ +package GO::Utils::General; + +=head1 NAME + +GO::Utils::General - provides some general utilities for clients of other GO classes + +=head1 SYNOPSIS + + use GO::Utils::General qw (CategorizeGenes); + + CategorizeGenes(annotation => $annotation, + genes => \@genes, + ambiguous => \@ambiguous, + unambiguous => \@unambiguous, + notFound => \@notFound); + +=cut + +use strict; +use warnings; +use diagnostics; + +use vars qw (@ISA @EXPORT_OK $VERSION); +use Exporter; +@ISA = ('Exporter'); +@EXPORT_OK = qw (CategorizeGenes); + +$VERSION = 0.11; + +my @kRequiredArgs = qw (annotation genes ambiguous unambiguous notFound); + +########################################################################## +sub CategorizeGenes{ +########################################################################## + +=head2 CategorizeGenes + +CategorizeGenes categorizes a set of genes into three categories, whether they +are ambiguous, whether they are not found, or whether they are unambiguous. The +category to which they belong is determined by using an annotation provider. + +Usage: + + use GO::Utils::General qw (CategorizeGenes); + + CategorizeGenes(annotation => $annotation, + genes => \@genes, + ambiguous => \@ambiguous, + unambiguous => \@unambiguous, + notFound => \@notFound); + +All the above named arguments are required: + +annotation : A GO::AnnotationProvider concrete subclass instance +genes : A reference to an array of gene names + +The remaining arguments should be empty arrays, passed in by reference, that +will be populated by this function. + +=cut + +########################################################################## + + my (%args) = @_; + + foreach my $arg (@kRequiredArgs){ + + if (!exists $args{$arg} || !defined $args{$arg}){ + + die "You must provide a $arg argument to CategorizeGenes"; + + } + + } + + my $annotation = $args{'annotation'}; + my $genesRef = $args{'genes'}; + my $ambiguousRef = $args{'ambiguous'}; + my $unambiguousRef = $args{'unambiguous'}; + my $notFoundRef = $args{'notFound'}; + + foreach my $gene (@{$genesRef}){ + + if ($annotation->nameIsAmbiguous($gene)){ + + if ($annotation->nameIsStandardName($gene)) { + if ($annotation->databaseIdByStandardName($gene)) { + push(@{$unambiguousRef}, $gene); + next; + } + } + push(@{$ambiguousRef}, $gene); + next; + } + + my $name = $annotation->standardNameByName($gene); + + if (defined $name){ + + push(@{$unambiguousRef}, $gene); + + }else{ + + push(@{$notFoundRef}, $gene); + + } + + } + +} + +1; # keep Perl happy diff --git a/www/lib/GO/View.pm b/www/lib/GO/View.pm new file mode 100644 index 0000000..01c23aa --- /dev/null +++ b/www/lib/GO/View.pm @@ -0,0 +1,2809 @@ +package GO::View; + +######################################################################### +# Module Name : View.pm +# +# Date created : Oct. 2003 +# +# Cared for by Shuai Weng +# +# You may distribute this module under the same terms as perl itself +######################################################################### + +# POD documentation - main docs before the code + +# TODO + +# have a much better options handling set of code, probably based on Getopt::Long +# see http://www.perl.com/pub/a/2007/07/12/options-and-configuration.html + +=pod + +=head1 NAME + +GO::View - Creates a gif or png image for visualizing the GO DAG + + +=head1 DESCRIPTION + +This perl module generates a graphic that displays the parent and child +relationships of a selected GO term. It also provides the visualization +for the GO::TermFinder perl module created by the Stanford Microarray +Database (SMD). This module is useful when analyzing experimental or +computational results that produce a set of gene products that may have +a common function or process. + +=head1 SYNOPSIS + + use GO::View; + + my $goView = + + GO::View->new(-goid => $goid, + -ontologyProvider => $ontology, + -annotationProvider => $annotation, + -termFinder => \@pvalues, + -aspect => 'P', + -configFile => $confFile, + -imageDir => "/tmp", + -imageUrlRoot => "http://www.ABC.com/tmp", + -imageName => "GOview.88.png", + -tree => 'up', + -nodeUrl => $goUrl, + -geneUrl => $geneUrl, + -pvalueCutOff => '0.01', + -imageLabel => "SGD"); + + + argument required expect data and type + ------------------------------------------------------------------------- + -goid No A gene ontology ID (GOID). + If nothing is passed in, the module + will use the top goid of each ontology + branch (i.e, goid for + molecular_function, biological_process, + or cellular_component) + + -ontologyProvider Yes An ontology provider instance. + + -annotationProvider No An annotation provider instance. It is + required for creating tree for GO Term + Finder result. + + -termFinder No An array of hash references returned + from 'findTerms' method of + GO::TermFinder module. It is required + for creating tree for GO Term Finder + result. + + -aspect No . The aspect of the ontology + provider. It is required for creating + tree for GO Term Finder result. + + -configFile Yes The configuration file for setting the + general variables for the graphic + display. + + -imageDir Yes The directory for storing the newly + created image file. It must be + world (nobody) readable and writable + if you want to display the image to + the web. + + -imageUrlRoot No The url root for the -imageDir. It is + required if you want to display the + image to the web. + + -imageName No The image file name. By default, the + name will be something like + 'GOview.xxxx.png'. The 'xxxx' will be + the process id. A suffix for the image (.png + or .gif) should not be provided, as that will + be determined at run time, depending on the + capabilities of the GD library. + + -treeType No . The tree type. + + 1. up => display the ancestor tree + for the given goid. + 2. down => display the descendant tree + for the given goid. + By default, it will display the + descendant tree. + + -geneUrl No The URL for each Gene to link to. + It needs to have the text in + the url which will be substituted + by the real goid for a node. + + -nodeUrl No The url for each GO node to link to. + It needs to have the text in + the url which will be substituted + by the real goid for a node. + + -pvalueCutOff No The p-value cutoff for displaying + the graphic for GO Term Finder. + The default is 0.01 + + -imageLabel No The image label which will appear at + the left bottom corner of the map. + + -maxTopNodeToShow No This argument is used to limit the + amount of the graph that might be + shown, for the sake of reducing run- + time. The default is 6. + + ------------------------------------------------------------------------ + + To display the image on the web: + + $goView->showGraph; + + To create and return image file name with full path: + + my $imageFile = $goView->createImage; + + + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Bug reports can be submitted via email + + shuai@genome.stanford.edu + +=head1 AUTHOR + +Shuai Weng, shuai@genome.stanford.edu + +=head1 COPYRIGHT + +Copyright (c) 2003 Stanford University. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=head1 APPENDIX + +The rest of the documentation details each of the public methods. + +=cut + +use strict; +use warnings; +use GD; +use GD::Polyline; # we use polylines for edges -- mark +use GraphViz; +use IO::File; + +use GO::View::GD; + +use vars qw ($PACKAGE $VERSION); + +$PACKAGE = 'GO::View'; +$VERSION = 0.15; + +my $kReplacementText = ""; + +######################################################################### + +=head1 METHODS + +=cut + +######################################################################### +sub new { +######################################################################### + +=head2 new + + Title : new + Function: Initializes the GO::View object. + : Recognized named parameters are -goid, -ontologyProvider, + -annotationProvider, -termFinder, -aspect, -configFile, + -imageDir, -imageUrlRoot, -imageName, -treeType, -nodeUrl, + -imageLabel + Returns : a new object + Args : named parameters + +=cut + +######################################################################### + my ($class, %args) = @_; + + my $self = bless {}, $class; + + $self->_init(%args); + + return $self; + +} + +################################################################# +sub graph { +################################################################# + +=head2 graph + + Title : graph + Usage : my $graph = $goView->graph; + Function: Gets the newly created Graphviz instance. + Returns : a new Graphviz instance. + +=cut + +################################################################ + + return $_[0]->{GRAPH}; + +} + +################################################################ +sub showGraph { +################################################################ + +=head2 showGraph + + Title : showGraph + Usage : $goView->showGraph; + Function: Creates the image and print the map image to a file. + Returns : the name of the file to which the image was written + Throws : Exception if the imageUrlRoot is not passed to the object. + +=cut + +######################################################################### + my ($self) = @_; + + if ($self->graph) { + + $self->_createAndShowImage; + + } + + return $self->{IMAGE_FILE}; + +} + +######################################################################### +sub imageFile { +######################################################################### + +=head2 imageFile + + Title : imageFile + Usage : my $imageFile = $goView->imageFile; + Function: Gets the newly created image file name (with full path). + Returns : image file name. + +=cut + +######################################################################### + + my ($self) = @_; + + return $self->{IMAGE_FILE}; + + +} + +################################################################ +sub createImage { +################################################################ + +=head2 createImage + + Title : createImage + Usage : $goView->createImage; + Function: Creates the GO tree image file. Calls it only if you + want to create the image file only and do not want to + display the image. + Returns : The newly created image file name with full path. + +=cut + +######################################################################### + + my ($self) = @_; + + if ($self->graph) { + + $self->{CREATE_IMAGE_ONLY} = 1; + + return $self->_createAndShowImage; + + } + +} + +################################################################ +sub imageMap{ +################################################################ + +=head2 imageMap + + Title : imageMap + Usage : my $map = $goView->imageMap; + Function : returns the text that constitutes an image map for the + created image. + Returns : a string + +=cut + +######################################################################### + + return $_[0]->{IMAGE_MAP}; + +} + +################################################################ +sub _goid { +################################################################ +# +# =head2 _goid +# +# Title : _goid +# Usage : my $goid = $self->_goid; +# Function: Gets the goid that client interface passed in. +# Returns : GOID +# Args : none +# +# =cut +# +######################################################################### + + my ($self) = @_; + + return $self->{GOID}; + +} + +######################################################################### +sub _ontologyProvider { +######################################################################### +# +# =head2 _ontologyProvider +# +# Title : _ontologyProvider +# Usage : my $ontology = $self->_ontologyProvider; +# Function: Gets the ontology provider instance which is passed in by +# client interface. +# Returns : ontology provider instance +# Args : none +# +# =cut +# +######################################################################### + + my ($self) = @_; + + return $self->{ONTOLOGY}; + +} + +######################################################################### +sub _annotationProvider { +######################################################################### +# +# =head2 _annotationProvider +# +# Title : _annotationProvider +# Usage : my $annotation = $self->_annotationProvider; +# Function: Gets the annotation provider instance which is passed in by +# client interface. +# Returns : annotation provider instance +# Args : none +# +# =cut +# +######################################################################### + + my ($self) = @_; + + return $self->{ANNOTATION}; + +} + +######################################################################### +sub _termFinder { +######################################################################### +# +# =head2 _termFinder +# +# Title : _termFinder +# Usage : my $termFinder = $self->_termFinderProvider; +# Function: Gets the term finder result arrayref which is passed in by +# client interface. +# Returns : term finder result arrayref +# Args : none +# +# =cut +# +######################################################################### + + my ($self) = @_; + + return $self->{TERM_FINDER}; + +} + +######################################################################### +sub _init { +######################################################################### +# +# =head2 _init +# +# Title : _init +# Usage : n/a; automatically called by new() +# Function: Initializes the variables required for creating the map. +# Returns : void +# Args : n/a +# Throws : Exception if ontology provider instance or tmp image +# directory for storing the image file are not passed +# to this object. +# +# =cut +# +######################################################################### + + my ($self, %args) = @_; + + # first do some sanity checks + + if (!$args{-ontologyProvider} || !$args{-configFile} || + !$args{-imageDir}) { + + die "Can't build a $PACKAGE object without the ontologyProvider, configuration file, and tmp image directory name for storing the image file."; + + } + + $self->{ONTOLOGY} = $args{-ontologyProvider}; + + if ($args{-goid} && $args{-goid} !~ /^[0-9]+$/ && + $args{-goid} !~ /^GO:[0-9]+$/) { + + die "The goid ($args{-goid}) passed to $PACKAGE is is not in a recognized format, such as GO:0000166."; + + } + + my $goid = $args{-goid}; + + # fix format of the GOID, so it is padded preceded with GO: and properly padded + + if ($goid && $goid !~ /^GO:[0-9]+$/) { + + $goid = $self->_formatGoid($goid); + + } + + # if we have no goid passed in, then we're likely creating an + # image based upon the output from GO::TermFinder. In this case + # we just set the goid to the node corresponding to the aspect + # that is being dealt with + + if (!$goid) { + + # set top goid for the given ontology (molecular_function, + # biological_process, or cellular_component). + + $goid = $self->_initGoidFromOntology; + + } + + # now store the goid + + $self->{GOID} = $goid; + + # work out the image name and url - note that they will both + # receive a suffix later on that indicates the type of image that + # has been output (png or gif) + + $self->{IMAGE_DIR} = $args{-imageDir}; + + if ($self->{IMAGE_DIR} !~ /\/$/) { + + $self->{IMAGE_DIR} .= "/"; + + } + + my $suffix; + + if (GD::Image->can('png')){ + + $suffix = 'png'; + + }else{ + + $suffix = 'gif'; + + } + + my $imageName; + + if (exists $args{-imageName} && defined $args{-imageName} && $args{-imageName} ne ""){ + + $imageName = $args{-imageName}; + + }else{ + + my $id = $$; + + # now keep incrementing $id, until the name + # doesn't clash with an existing file + + while (-e $self->{IMAGE_DIR}."GOview.$id.$suffix"){ + + $id++; + + } + + $imageName = "GOview.$id"; + + } + + $imageName .= ".$suffix"; + + $self->{IMAGE_FILE} = $self->{IMAGE_DIR}.$imageName; + + if ($args{-imageUrlRoot}) { + + $self->{IMAGE_URL} = $args{-imageUrlRoot}; + + if ($self->{IMAGE_URL} !~ /\/$/) { $self->{IMAGE_URL} .= "/"; } + + $self->{IMAGE_URL} .= $imageName; + + }else{ + + # if we didn't get a root url, we just assume that the image will + # be in the same directory as the image + + $self->{IMAGE_URL} = "./".$imageName; + + } + + # now set the TREE_TYPE, which can be up or down. + + $self->{TREE_TYPE} = $args{-treeType} || 'down'; + + my $count; + + $self->{MAX_TOP_NODE_TO_SHOW} = $args{-maxTopNodeToShow} || 6; + + if ($args{-annotationProvider} && $args{-termFinder} && + $args{-aspect}) { + + # we're dealing with GO::TermFinder output, so we'll + # store and initialize all the information we need + + $self->{PVALUE_CUTOFF} = $args{-pvalueCutOff} || 0.01; + + #LINDA'S ADDITION: +# $self->{MAX_TOP_NODE_TO_SHOW} = $args{-maxTopNodeToShow}; + $self->{MAX_CHILD_GOID_NUM} = $args{-maxChildGoidNum}; + + $self->{ANNOTATION} = $args{-annotationProvider}; + + $self->{TERM_FINDER} = $args{-termFinder}; + + $self->{ASPECT} = $args{-aspect}; + + $count = $self->_initPvaluesGeneNamesDescendantGoids; + + }elsif ($args{-annotationProvider} || $args{-termFinder} || + $args{-aspect}) { + + die "You have to pass annotation provider and term finder instances and GO aspect ([P|F|C]) to $PACKAGE if you want to display graphic for term finder result."; + + } + + # store some more variables + + $self->{IMAGE_LABEL} = $args{-imageLabel}; + + $self->{GENE_URL} = $args{-geneUrl}; + + $self->{GO_URL} = $args{-nodeUrl}; + + $self->_initVariablesFromConfigFile($args{-configFile}); + + # only make the graph if we had at least one node passing our p value cutoff + + $self->_createGraph if ($count > 0); + + # support these new options -- mark + $self->{MAKE_PS} = $args{-makePS}; + $self->{MAKE_SVG} = $args{-makeSVG}; + +} + +################################################################ +sub _createGraph { +################################################################ +# +# =head2 _createGraph +# +# Title : _createGraph +# Usage : $self->_createGraph; +# automatically called by _init() +# Function: To create the GraphViz instance and add each descendant/ +# ancestor goid into the Graph tree. +# Returns : newly created GraphViz instance. +# +# =cut +# +######################################################################### + + my ($self) = @_; + + # If client does not ask for up (ancestor) tree and this is not + # for the special tree paths client asked for the given goid to + # its specified descendants (i.e. for GO Term Finder), then we + # need to determine how many generations of the descendants we can + # display. + + if ($self->{TREE_TYPE} !~ /up/i && + !$self->{DESCENDANT_GOID_ARRAY_REF}) { + + $self->_setGeneration($self->_goid); + + } + + # If this is for the ancestor tree, we need to determine up to + # which ancestor we want to display the tree paths. We will + # display tree path up to $self->{TOP_GOID} + + if ($self->{TREE_TYPE} =~ /up/i) { + + $self->_setTopGoid; + + if (!$self->{TOP_GOID}) { return; } + + $self->{NODE_NUM} = $self->_descendantCount($self->{TOP_GOID}, + $self->{GENERATION}); + + } + + my $goid; + + if ($self->{TOP_GOID}) { + + $goid = $self->{TOP_GOID}; + + }else { + + $goid = $self->_goid; + + } + + $self->_createGraphObject; + + my %foundNode; + my %foundEdge; + + # add the top node to the graph + + $self->_addNode($goid, + fillcolor => $self->_colorForNode($goid), + color => $self->_colorForNode($goid)); + + # and record that we've seen it + + $foundNode{$goid}++; + + # draw go_path for ancestor goid ($self->{GOID}) to each + # descendant goid in @{$self->{DESCENDANT_GOID_ARRAY_REF}}. The + # DESCENDANT_GOID_ARRAY_REF is only created if we were dealing + # with output from GO::TermFinder + + if ($self->{DESCENDANT_GOID_ARRAY_REF}) { + + # get the top node + + my $topAncestorNode = $self->_ontologyProvider->nodeFromId($self->_goid); +# my $topAncestorNode = $self->_ontologyProvider->nodeFromId($goid); #!? + + # and record its term + + $self->{TERM_HASH_REF_FOR_GOID}{$self->_goid} = $topAncestorNode->term; + + my $i; + + # now go through every GO Node to which our list of genes is + # directly annotated that contibuted to the gene count of a + # node that passed the cutoff + + foreach my $goid (@{$self->{DESCENDANT_GOID_ARRAY_REF}}) { + + my $childNode = $self->_ontologyProvider->nodeFromId($goid); + + # get the list of paths that link from this node up to the + # top - the first node in each path is the root, and the + # final node is the immediate parent of $childNode + + my @path = $childNode->pathsToAncestor($topAncestorNode); + + # now add that path to the graph + + my $found = $self->_addAncestorPathToGraph($childNode, + \@path, + \%foundNode, + \%foundEdge); + + # we can skip to the next goid if no new nodes were added + # to the graph + + next if (!$found); + + $i++; + + # now add the genes that are annotated to this node + + if ($self->{GENE_NAME_HASH_REF_FOR_GOID} && + $self->{GENE_NAME_HASH_REF_FOR_GOID}->{$goid}) { + + my $loci = $self->{GENE_NAME_HASH_REF_FOR_GOID}->{$goid}; + + $loci =~ s/:/ /g; + + $self->_addNode($loci, + fillcolor => 'grey65', + color => 'grey65', + fontcolor => 'blue'); + + $self->_addEdge($goid, $loci); + + } + + } + + return; + + } + + # draw part of the tree, and it can go up and down the tree. + + # draw up tree and only show ancestors in the paths from the given + # goid to the ancestor goid $self->{TOP_GOID} since there are too + # many nodes... + + if ($self->{TREE_TYPE} =~ /up/i && + $self->{NODE_NUM} > $self->{MAX_NODE}) { + + my $childNode = $self->_ontologyProvider->nodeFromId($self->_goid); + + my $topAncestorNode = $self->_ontologyProvider->nodeFromId($goid); + + my @path = $childNode->pathsToAncestor($topAncestorNode); + + $self->_addAncestorPathToGraph($childNode, + \@path, + \%foundNode, + \%foundEdge); + + return; + + } + + ##### draw down tree + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + $self->_addChildOfTheNodeToGraph($node, + \%foundNode, + \%foundEdge); + +} + +################################################################ +sub _addChildOfTheNodeToGraph { +################################################################ +# +# =head2 _addChildOfTheNodeToGraph +# +# Title : _addChildOfTheNodeToGraph +# Usage : $self->_addChildOfTheNodeToGraph($node, +# $foundNodeHashRef, +# $foundEdgeHashRef); +# +# Function: To add each unique descendant of the given node to the +# graph tree. +# Returns : void +# +# =cut +# +######################################################################### + +# This is only called when we are dealing with a 'down' tree. It is +# not called when dealing with GO::TermFinder output + + my ($self, $node, $foundNodeHashRef, $foundEdgeHashRef, + $generation) = @_; + + if (!$generation) { $generation = 1; } + + my @childNodes = $node->childNodes; + + foreach my $childNode ($node->childNodes) { + + my $parentGoid = $node->goid; + + my $childGoid = $childNode->goid; + + if (!$foundNodeHashRef->{$parentGoid}) { + + $self->_addNode($parentGoid); + + $foundNodeHashRef->{$parentGoid}++; + + } + + if (!$foundNodeHashRef->{$childGoid}) { + + $self->_addNode($childGoid); + + $foundNodeHashRef->{$childGoid}++; + + } + if (!$foundEdgeHashRef->{$parentGoid."::".$childGoid}) { + + $self->_addEdge($parentGoid, $childGoid); + + $foundEdgeHashRef->{$parentGoid."::".$childGoid}++; + + } + + if ($generation < $self->{GENERATION}) { + + $self->_addChildOfTheNodeToGraph($childNode, + $foundNodeHashRef, + $foundEdgeHashRef, + $generation++); + } + + } + +} + +################################################################ +sub _addAncestorPathToGraph { +################################################################ +# +# =head2 _addAncestorPathToGraph +# +# Title : _addAncestorPathToGraph +# Usage : $self->_addAncestorPathToGraph($node, +# $ancestorPathArrayRef, +# $foundNodeHashRef, +# $foundEdgeHashRef); +# +# Function: To add each unique ancestor of the given node to the +# graph tree. +# Returns : void +# +# =cut +# +######################################################################### + +# This is only called when we are dealing with an 'up' tree, or when +# dealing with GO::TermFinder output + + my ($self, $childNode, $ancestorPathArrayRef, + $foundNodeHashRef, $foundEdgeHashRef) = @_; + + my $found; + + # go through each path back to the root + + foreach my $ancestorNodeArrayRef (@{$ancestorPathArrayRef}) { + + # add the child node to the path, so it gets included too + + push(@{$ancestorNodeArrayRef}, $childNode); + + # reverse the order, so that we have the child node first, and + # the root last + + @{$ancestorNodeArrayRef} = reverse(@{$ancestorNodeArrayRef}); + + # now go through the path + + for (my $i = 0; $i < @{$ancestorNodeArrayRef}; $i++) { + + my ($goid1, $goid2); + + # get the goid for the current node, and store it's term + + if (defined $ancestorNodeArrayRef->[$i]) { + + $goid1 = $ancestorNodeArrayRef->[$i]->goid; + + $self->{TERM_HASH_REF_FOR_GOID}{$goid1} + = $ancestorNodeArrayRef->[$i]->term; + + } + + # get the goid for the next node (the current node's + # parent), and store it's term too + + if (defined $ancestorNodeArrayRef->[$i+1]) { + + $goid2 = $ancestorNodeArrayRef->[$i+1]->goid; + + $self->{TERM_HASH_REF_FOR_GOID}{$goid2} + = $ancestorNodeArrayRef->[$i+1]->term; + + } + + # if the current node isn't already on the graph, add it + + if ($goid1 && !$foundNodeHashRef->{$goid1}) { + + $self->_addNode($goid1, + fillcolor => $self->_colorForNode($goid1), + color => $self->_colorForNode($goid1)); + + # record that we've seen it + + $foundNodeHashRef->{$goid1}++; + + } + + # if we have a parent, and we haven't yet recorded an edge + # between the child and parent, let's add that + + if ($goid1 && $goid2 && + !$foundEdgeHashRef->{$goid2."::".$goid1}) { + + $self->_addEdge($goid2, $goid1); + + # record that we've added the edge + + $foundEdgeHashRef->{$goid2."::".$goid1}++; + + } + + } + + # record that something has been added to the graph (as it's + # possible that we may end up finding we've added everything + + $found++; + + } + + # return whether something was added to the graph + + return $found; + +} + +################################################################ +sub _createAndShowImage { +################################################################ +# +# =head2 _createAndShowImage +# +# Title : _createAndShowImage +# Usage : $self->_createAndShowImage(); +# automatically called by showGraph() and createImage(). +# Function: To create the graph tree image. It will print the image to +# stdout if it is called by showGraph(). +# Returns : returns graphText file if the text format is changed. +# returns image file name if called by createImage(). +# +# =cut +# +######################################################################### + + my ($self) = @_; + + my ($width, $height); + + # first thing we do is get the contents of the graph in text form. + # We will then use this text to create a gif or png image, with + # various boxes etc that have the coordinates as indicated by the + # text from the graph image. + + # the actual contents of the text string will be something like: + + # digraph test { + # node [label="\N", shape=box]; + # graph [bb="0,0,662,1012"]; + # node1 [label=" biological_process\nGO:GO:0008150", pos="342,986", width="1.97", height="0.50"]; + # node2 [label="pre-replicative\ncomplex\nformation and\n maintenance\nGO:GO:0006267", pos="162,122", width="1.69", height="1.17"]; + # node3 [label="DNA-dependent\n DNA replication\nGO:GO:0006261", pos="353,226", width="1.86", height="0.72"]; + # node4 [label=" DNA replication\nGO:GO:0006260", pos="353,306", width="1.86", height="0.50"]; + # node5 [label="DNA replication\nand chromosome\n cycle\nGO:GO:0000067", pos="299,394", width="1.83", height="0.94"]; + # node6 [label=" cell cycle\nGO:GO:0007049", pos="271,482", width="1.72", height="0.50"]; + # node7 [label="cell\n proliferation\nGO:GO:0008283", pos="271,586", width="1.67", height="0.72"]; + # node8 [label="cell growth\nand/or\n maintenance\nGO:GO:0008151", pos="271,706", width="1.61", height="0.94"]; + # node9 [label="cellular\nphysiological\n process\nGO:GO:0050875", pos="272,810", width="1.67", height="0.94"]; + # node10 [label="cellular\n process\nGO:GO:0009987", pos="272,906", width="1.69", height="0.72"]; + # node11 [label="physiological\n process\nGO:GO:0007582", pos="412,906", width="1.69", height="0.72"]; + # node12 [label=" DNA metabolism\nGO:GO:0006259", pos="422,482", width="1.97", height="0.50"]; + # node13 [label="nucleobase,\nnucleoside,\nnucleotide and\nnucleic acid\n metabolism\nGO:GO:0006139", pos="421,586", width="1.64", height="1.39"]; + # node14 [label=" metabolism\nGO:GO:0008152", pos="420,706", width="1.64", height="0.50"]; + # node15 [label=" 1: MCM4 MCM3 CDC6 MCM2", pos="119,26", width="3.31", height="0.50"]; + # node16 [label=" DNA unwinding\nGO:GO:0006268", pos="353,122", width="1.92", height="0.50"]; + # node17 [label=" 2: MCM4 MCM3 MCM2", pos="353,26", width="2.69", height="0.50"]; + # node18 [label="DNA replication\n initiation\nGO:GO:0006270", pos="534,122", width="1.78", height="0.72"]; + # node19 [label=" 3: MCM4 MCM3 MCM2", pos="565,26", width="2.69", height="0.50"]; + # node5 -> node4 [pos="e,342,324 320,360 325,351 331,341 337,332"]; + # node13 -> node12 [pos="e,422,500 421,536 421,527 422,517 422,509"]; + # node12 -> node4 [pos="e,360,324 415,464 402,433 377,369 363,333"]; + # node4 -> node3 [pos="e,353,252 353,288 353,280 353,271 353,262"]; + # node3 -> node2 [pos="e,223,155 305,200 283,187 256,173 232,160"]; + # node3 -> node16 [pos="e,353,140 353,200 353,184 353,165 353,149"]; + # node3 -> node18 [pos="e,489,148 399,200 423,186 454,168 480,153"]; + # node2 -> node15 [pos="e,127,44 143,80 139,70 135,61 131,52"]; + # node16 -> node17 [pos="e,353,44 353,104 353,90 353,69 353,53"]; + # node18 -> node19 [pos="e,559,44 542,96 547,82 552,66 556,53"]; + # node6 -> node5 [pos="e,288,428 277,464 279,457 282,447 285,438"]; + # node11 -> node14 [pos="e,419,724 413,880 415,842 418,773 419,734"]; + # node11 -> node9 [pos="e,322,844 374,880 361,871 345,860 330,850"]; + # node1 -> node11 [pos="e,389,932 358,968 365,959 374,949 382,940"]; + # node1 -> node10 [pos="e,295,932 326,968 319,959 310,949 302,940"]; + # node8 -> node7 [pos="e,271,612 271,672 271,656 271,638 271,621"]; + # node14 -> node13 [pos="e,420,636 420,688 420,677 420,661 420,647"]; + # node7 -> node6 [pos="e,271,500 271,560 271,544 271,525 271,509"]; + # node10 -> node9 [pos="e,272,844 272,880 272,872 272,863 272,854"]; + # node9 -> node8 [pos="e,271,740 272,776 272,768 271,758 271,749"]; + # } + + # a description of the dot language can be found at: + # + # http://www.research.att.com/~erg/graphviz/info/lang.html + + #! why is this always done? -- mark + if (1){#defined $self->{MAKE_PS} && $self->{MAKE_PS}){ + + my $file = $self->{IMAGE_FILE}; + + $file =~ s/\.\w+$/\.ps/; + + my $fh = IO::File->new($file, q{>} )|| die "Cannot create $file : $!"; + + print $fh $self->graph->as_ps; + + $fh->close; + + } + + # support SVG output -- mark + if (defined $self->{MAKE_SVG} && $self->{MAKE_SVG}){ + my $file = $self->{IMAGE_FILE}; + $file =~ s/\.\w+$/\.svg/; + my $fh = IO::File->new($file, q{>} )|| die "Cannot create $file : $!"; + print $fh $self->graph->as_svg; + $fh->close; + } + + # hence we can determine the size of the image, the positions and sizes + # of every box, and how to draw the edges between the boxes + + my $graphText = $self->graph->as_text; +# { +# my $fh = IO::File->new("/tmp/gtf.dot", q{>} )|| die "Cannot create /tmp/gtf.dot : $!"; +# print $fh $graphText; +# $fh->close; +# } + + # the following line *may* fix reported problems when running on + # Windows, that I think are a result of dot.exe using CRLF line + # endings. + + $graphText =~ s/\015?\012/\n/g; + + # remove line continuations + $graphText =~ s/\\\n//g; + # replace any newlines not following ';' or '{' with space + $graphText =~ s/([^\;\{])\n\s*/$1 /g; +# print $graphText; + + # if we can get the height and width, we'll get them + +# if ($graphText =~ /graph \[bb=\"0,0,([0-9]+),([0-9]+)\" *\]\;/) { + if ($graphText =~ /graph \[bb=\"0,0,([0-9\.]+),([0-9\.]+)\"/) { + + $width = $1 * $self->{WIDTH_DISPLAY_RATIO}; # round up? + + $height = $2 * $self->{HEIGHT_DISPLAY_RATIO}; # round up? + + }else { + + # otherwise, we simply create a png image and we're done + + # this seems to be undocumented - I'm not sure under what + # circumstances we can't actually get the height and width + + $self->graph->as_png($self->{IMAGE_DIR}."goPath.$$.png"); + + return $self->{IMAGE_DIR}."goPath.$$.png"; + + } + + my @graphLine = split(/\n/, $graphText); + + # add borders sizes to the height and width + + my $border = 25; + + my $mapWidth = $width + 2 * $border; + + my $mapHeight = $height + 2 * $border; + + my $keyY; + + # now modify the height and width according to unclear rules.... + + if ($self->{PVALUE_HASH_REF_FOR_GOID} || !$self->{GENE_NAME_HASH_REF_FOR_GOID}) { + + $keyY = $mapHeight; + + # make the width to be at least the minimum acceptable width + + if ($mapWidth < $self->{MIN_MAP_WIDTH}) { + + $mapWidth = $self->{MIN_MAP_WIDTH}; + + } + + # modify the height + + if (!$self->{GENE_NAME_HASH_REF_FOR_GOID}) { + + # if there are no genes annotated to nodes, use some + # complex, opaque and unclear rule to change the height + + $mapHeight += int((length($self->{MAP_NOTE})*6/($mapWidth-100))*15) + 65; + + }else { + + # otherwise just add 50, the 'magic number'... + + $mapHeight += 50; + + } + + } + + # now create a GD image of the appropriate height and width + + my $gd = GO::View::GD->new(width => $mapWidth, + height => $mapHeight); + + # add a border, with a label and a date + + $self->_drawFrame($gd, $mapWidth, $mapHeight); + + my (@nodeLine, @edgeLine); + + # now go through each line of the output from the graph->as_text method + + foreach my $line (@graphLine) { + + # in case line was continuated + # sometimes the break is just before ',', in which case the extra space needs to be removed + $line =~ s/\\(\s)/$1/g; + $line =~ s/\s,/,/g; + + # now store type of entity (nodes vs edges) in different + # arrays + +# if ($line =~ / *node[0-9]+ *\[.*(label=\".+\")/i) { + if ($line =~ /^\s*node[0-9]+\s*\[/) { + + # it's a node + push(@nodeLine, $line); + +# }elsif ($line =~ / *node[0-9]+ *-> *node[0-9]+ \[pos=\"(.+)\"/i) { + }elsif ($line =~ /^\s*node[0-9]+\s*->\s*node[0-9]+\s*\[pos=\"(.+)\"/i) { + + # it's an edge + + push(@edgeLine, $1); + + } + + } + + # add the keys, which are either keys for the p-value colors, or a + # general description about GO terms and their annotations + + if (exists $self->{PVALUE_HASH_REF_FOR_GOID} && + $height > $self->{MIN_MAP_WIDTH_FOR_ONE_LINE_KEY}) { + + ### draw keys on the top of the map + if ($width < $self->{MIN_MAP_WIDTH_FOR_ONE_LINE_KEY}) { + + $self->{MOVE_Y} = 15; + + } + +# $self->_drawKeys($gd, $mapWidth, 5, 'isTop'); + + } + + if ($self->{PVALUE_HASH_REF_FOR_GOID} || + !$self->{GENE_NAME_HASH_REF_FOR_GOID}) { + + $self->_drawKeys($gd, $mapWidth, $keyY); + + } + + # now draw the actual edges and nodes + + # do the edges first + + foreach my $line (@edgeLine) { + + $self->_drawEdge($gd, $height, $border, $line); + + } + + # and now the nodes + + foreach my $line (@nodeLine) { + + $self->_drawNode($gd, $height, $border, $line); + + } + + # now output the image to a file + + my $imageFile = $self->{IMAGE_FILE}; + my $imageUrl = $self->{IMAGE_URL}; + + my $fh = IO::File->new($imageFile, q{>} )|| die "Cannot create $imageFile : $!"; + + binmode $fh; + + if ($gd->im->can('png')) { + + print $fh $gd->im->png; + + }else { + + print $fh $gd->im->gif; + + } + + $fh->close; + + if ($self->{CREATE_IMAGE_ONLY}) { + + return $imageFile; + + }else{ + + my $map = $gd->imageMap; + + if (defined ($map)){ + + $self->{IMAGE_MAP} = + + "\n". + $gd->imageMap. + "". + "

\n"; + + } + + } + +} + +###################################################################### +sub _drawNode { +###################################################################### +# +# =head2 _drawNode +# +# Title : _drawNode +# Usage : $self->_drawNode($gd, $height, $border, $line); +# automatically called by _createAndShowImage(). +# Function: To draw each node. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $gd, $height, $border, $line) = @_; + + # let's let's extract the information from the line, e.g. which may look like: + # + # label="MCM4 MCM3 MCM2", pos="565,26", width="2.69", height="0.50" + # + # or: + # + # label=MET28, color=grey65, fillcolor=grey65, fontcolor=blue, pos="735,561", width="0.92", height="0.50" + # + # or: + # + # label=" DNA unwinding\nGO:GO:0006268", pos="353,122", width="1.92", height="0.50" + # + # depending on whether it's genes annotating a GO node, or a box + # representing a GO Node itself + + # get the label - may or may not be surrounded in quotes + + my $label; + + if ($line =~ /label=\"([^\"]*)\"/){ + + $label = $1; + + }elsif ($line =~ /label=(.+?), /){ + + $label = $1; + + } + + my @label = split(/\\n/, $label); + + # get the width and height, and convert to number of pixels - I + # don't know where the use of the value '60' comes from. The documentation for graphviz says that + # the default scale is actually 72 pixels per inch... + + $line =~ /width=\"?([\d\.]+)/; + + my $boxW = $1 * 60 + 5; # tweak - mark + + $line =~ /height=\"?([\d\.]+)/; + + my $boxH = $1 * 55; # tweak - mark + + # remove some arbitrary amount based on the label - presumably + # this is because we're removing the GO id from the label? + + if (!$self->{MOVE_Y}) { + + $self->{MOVE_Y} = 0; + + } + + # now get the coordinates of the center of box for the node, and + # use that to work out the coordinate of the bounding box for the + # node. + + $line =~ /pos=\"([0-9\.]+),([0-9\.]+)\"/; + + my $x1 = int($1 * $self->{WIDTH_DISPLAY_RATIO}-$boxW/2 + $border); + + my $y1 = int($height - $2 * $self->{HEIGHT_DISPLAY_RATIO} - $boxH/2 + $border + $self->{MOVE_Y}); + + my $x2 = $x1 + $boxW; + + my $y2 = $y1 + $boxH; + + # Retrieve stored goid, which we now save in + # a hash, rather than in the label. -- mark + my $goid = $self->{LABEL_TO_GOID}->{$label}; + + my $geneNum; + my $totalGeneNum; + my $barColor; + my $outline; + my $linkUrl; + + # now work out the color for the box, and work out URL links + # for the nodes + + if ($self->{PVALUE_HASH_REF_FOR_GOID} && $goid) { + + # we must be dealing with GO::TermFinder output for a goid + + $barColor = $self->_getBoxColor($gd, $goid); + + if (!$self->{CREATE_IMAGE_ONLY}) { + + $linkUrl = $self->{GO_URL}; + + $linkUrl =~ s/$kReplacementText/$goid/ if $linkUrl; + + } + + }elsif ($goid) { + + # non GO::TermFinder output for a GOID + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + if ($node && $node->childNodes) { + + $barColor = $gd->lightBlue; + + if (!$self->{CREATE_IMAGE_ONLY}) { + + $linkUrl = $self->{GO_URL}; + + $linkUrl =~ s/$kReplacementText/$goid/ if $linkUrl; + + } + + }else { + + # the box isn't representing a GOID, e.g. annotating genes + + $barColor = $gd->grey; + + } + + }else { + + $barColor = $gd->grey; + + } + + + my $onInfoText; + + if (( $self->{TOP_GOID} && $goid && $goid eq $self->{TOP_GOID}) || + (!$self->{TOP_GOID} && $goid && $goid eq $self->_goid)) { + + $self->_drawUpArrow($gd, $goid, ($x1+$x2)/2-7, + ($x1+$x2)/2+7, $y1-15, 10, + $linkUrl); + + } + + # now draw the box itself + $gd->drawBar(barColor => $barColor, + numX1 => $x1, + numX2 => $x2, + numY => $y1, + linkUrl => $linkUrl, + barHeight => $boxH, + outline => $outline, + onInfoText => $onInfoText); + + + # and now add the label to the box, one line at a time + + my $i = 0; + my $y3 = $y1 + int(($boxH - 12*(@label)) / 2) - 2; # center - mark + + foreach my $label (@label) { + + # skip if it's a GOID or is blank + + next if (!$label || $label =~ /^GO:/i); + + if (!$goid) { + + $label =~ s/[0-9]+://i; + + } + + my $nameColor = $gd->black; + + if (!$goid) { + + $nameColor = $gd->blue; + + }elsif ($goid eq $self->_goid) { + + $nameColor = $gd->red; + + } + + my $startPixel = int(($boxW - length($label)*6)/2); + my $numX1 = $x1 + $startPixel; +# my $numY1 = $y1 + $i*12; + my $numY1 = $y3 + $i*12; + + if ($goid) { + + # the box we're labeling is for a goid + + $gd->drawName(name => $label, + nameColor => $nameColor, + numX1 => $numX1, + numY => $numY1); + + }else { + + # we're adding in a list of genes + + my @gene = split(' ', $label); + + # add in each one - the image map being generated by + # the GO::View::GD object will have the relevant + # information added to support linking genes to it. + + # go through each gene + + foreach my $gene(@gene) { + + my $linkUrl; + + if (!$self->{CREATE_IMAGE_ONLY} && $self->{GENE_URL}) { + + $linkUrl = $self->{GENE_URL}; + + $linkUrl =~ s/$kReplacementText/$gene/; + + } + + # add the gene name + + $gd->drawName(name => $gene, + nameColor => $nameColor, + linkUrl => $linkUrl, + numX1 => $numX1, + numY => $numY1); + + $numX1 += (length($gene)+1)*6; + + } + + } + + $i++; + + } + + # I think this is supposed to say something about how many + # genes are annotated to a given node, but am not sure that + # $geneNum ever gets defined... + + if ($geneNum) { + + my $label = $geneNum." gene"; + + if ($totalGeneNum != 1) { + + $label .= "s"; + + } + + my $startPixel = int(($boxW - length($label)*6)/2); + + my $numX1 = $x1 + $startPixel+2; + + my $numY1 = $y1 + $i*10+2; + + $gd->drawName(name => $label, + nameColor => $gd->maroon, + numX1 => $numX1, + numY => $numY1); + + } + +} + +###################################################################### +sub _drawEdge { +###################################################################### +# +# =head2 _drawEdge +# +# Title : _drawEdge +# Usage : $self->_drawEdge($gd, $height, $border, $line); +# automatically called by _createAndShowImage(). +# Function: To draw each edge. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $gd, $height, $border, $line) = @_; + + # $line will contain something like: + # + # 342,324 320,360 325,351 331,341 337,332 + # + # where each pair of coordinates defines a point on the line. + # Thus to draw the line, we simply connect the points. + + # rather than crooked edges, draw splines -- mark + my @points = split(/ /, $line); + my $polyline = GD::Polyline->new; + my $spline; + + for (my $p = 0; $p <= $#points; $p++) { + + my $point = $points[$p]; + + my ($x, $y) = split(/\,/, $point); + + # sometimes we get trailing '\' etc. + my $x1 = $x; + my $y1 = $y; + + $x =~ s/[^\d\.]//g; + $y =~ s/[^\d\.]//g; + +# print "BEFORE '$x1', '$y1', AFTER '$x', '$y'\n" if (($x1 ne $x) || ($y1 ne $y)); + + $x *= $self->{WIDTH_DISPLAY_RATIO}; + + $x += $border; + + if (!defined $self->{MOVE_Y}) { $self->{MOVE_Y} = 0; } + +# $y = $height - $y*$self->{HEIGHT_DISPLAY_RATIO} + $border + +# $self->{MOVE_Y} + 5; + $y = $height - $y*$self->{HEIGHT_DISPLAY_RATIO} + $border + + $self->{MOVE_Y}; + + $polyline->addPt($x, $y); + } + + if ($#points % 3 == 0) { + $spline = $polyline->toSpline(); + $gd->im->polydraw($spline, $gd->black); + } else { + $gd->im->polydraw($polyline, $gd->black); + } + +# this is the original routine -- mark +# my @point = split(/ /, $line); +# +# # get rid of everything prior to the first point +# +# my ($preX, $preY); +# +# # now go through each point +# +# foreach my $point (@point) { +# +# my ($x, $y) = split(/\,/, $point); +# +# # modify the x coorfinate to take account of the border and +# # scaling factor +# +# $x *= $self->{WIDTH_DISPLAY_RATIO}; +# +# $x += $border; +# +# if (!defined $self->{MOVE_Y}) { $self->{MOVE_Y} = 0; } +# +# # modify the y coordinate based on the scaling factor, the border +# # the 'MOVE_Y' (whatever that is) and an arbitrary 5 +# +# $y = $height - $y*$self->{HEIGHT_DISPLAY_RATIO} + $border + +# $self->{MOVE_Y} + 5; +# +# # now if we have a prior x and y coordinate, we can draw a +# # line from it to the current point +# +# if ($preX && $preY) { +# +# $gd->im->line($x, $y, $preX, $preY, $gd->black); +# +# } +# +# # remember these coordinates for the next time through the loop +# +# $preX = $x; +# +# $preY = $y; +# +# } + +} + +################################################################# +sub _drawUpArrow { +################################################################# +# +# =head2 _drawUpArrow +# +# Title : _drawUpArrow +# Usage : $self->_drawUpArrow($gd, $goid, $X1, $X2, $Y, $barHeight, +# $linkUrl); +# automatically called by _drawNode(). +# Function: To draw an up arrow on the tree map. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $gd, $goid, $X1, $X2, $Y, $barHeight, + $linkUrl) = @_; + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + my $maxGenerationUp = $node->lengthOfLongestPathToRoot; + + if ($maxGenerationUp <= 1) { return; } + + if (!$self->{CREATE_IMAGE_ONLY} && $linkUrl) { + + $linkUrl .= "&tree=up"; + + } + $gd->drawBar(barColor => $gd->blue, + numX1 => $X1, + numX2 => $X2, + numY => $Y, + linkUrl => $linkUrl, + barHeight => $barHeight, + outline => 1, + arrow => 'up'); + +} + +###################################################################### +sub _drawKeys { +###################################################################### +# +# =head2 _drawKeys +# +# Title : _drawKeys +# Usage : $self->_drawKeys($gd, $mapWidth, $keyY, $isTop); +# automatically called by _createAndShowImage(). +# Function: To draw the display keys on the top or bottom of the tree map. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $gd, $mapWidth, $keyY, $isTop) = @_; + + if (!$self->{GENE_NAME_HASH_REF_FOR_GOID}) { + + my $y = $keyY; + + my $boxH = 10; + + my $startX = 50; + + $gd->drawBar(barColor => $gd->lightBlue, + numX1 => $startX, + numX2 => $startX + 20, + numY => $y, + barHeight => $boxH); + + my $numX1 = $startX + 20; + + $gd->drawName(name => " = GO term with child(ren)", + nameColor => $gd->black, + numX1 => $numX1, + numY => $y-2); + + $y += 15; + + $gd->drawBar(barColor => $gd->grey, + numX1 => $startX, + numX2 => $startX + 20, + numY => $y, + barHeight => $boxH); + + $gd->drawName(name => " = GO term with no child(ren)", + nameColor => $gd->black, + numX1 => $numX1, + numY => $y-2); + + my $maxTextLen = int(($mapWidth-2*$startX)/6); + + my $text = $self->_processLabel($self->{MAP_NOTE}, $maxTextLen); + + my @geneNumExample = split(/\12/, $text); + + $y += 15; + + foreach my $text (@geneNumExample){ + + $text =~ s/^ *//; + + $gd->drawName(name => $text, + nameColor => $gd->black, + numX1 => $startX, + numY => $y-2); + + $y += 15; + + } + + return; + + } + + my $y1 = $keyY; + + my $boxH = 15; + + my $boxW = 88; + + my $startX = 48 + ($mapWidth - $boxW*6 - 35 - 48)/2; + + my $twoLine; + + if ($startX < 48) { + + $startX = 48 + ($mapWidth - $boxW*3 - 15 - 48)/2; + + $twoLine = 1; + + if (!$isTop) { $y1 -= 10; } + + } + + ####### new code + + if ($isTop) { + + #$startX = 48; + + } + + $gd->drawName(name => 'pvalue:', + nameColor => $gd->black, + numX1 => 10, + numY => $y1+1); + + my $i; + my $preX2 = $startX; + + foreach my $name ('<=1e-10', '1e-10 to 1e-8', '1e-8 to 1e-6', + '1e-6 to 1e-4', '1e-4 to 1e-2', '>0.01') { + + $i++; + + my $pvalue = $name; + + $pvalue =~ s/^<=//; + + $pvalue =~ s/^>0.01/1/; + + $pvalue =~ s/^.+ to (.+)$/$1/; + + my $barColor = $self->_color4pvalue($gd, $pvalue); + + if ($i == 4 && $twoLine) { + $preX2 = $startX; + $y1 += 20; + } + + my $x1 = $preX2 + 5; + + my $x2 = $x1 + $boxW; + + $gd->drawBar(barColor => $barColor, + numX1 => $x1, + numX2 => $x2, + numY => $y1, + barHeight => $boxH); + + my $numX1 = $x1 + ($boxW-length($name)*6)/2; + + my $numY1 = $y1 + 2; + + $gd->drawName(name => $name, + nameColor => $gd->black, + numX1 => $numX1, + numY => $numY1); + + $preX2 = $x2; + + } + +} + +###################################################################### +sub _drawFrame { +###################################################################### +# +# =head2 _drawFrame +# +# Title : _drawFrame +# Usage : $self->_drawFrame($gd, $width, $height); +# automatically called by _createAndShowImage(). +# Function: To draw a frame around the image map with date and label +# on the bottom corner. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $gd, $width, $height) = @_; + + $gd->drawFrameWithLabelAndDate(width => $width, + height => $height, + text => $self->{IMAGE_LABEL}); + +} + +################################################################ +sub _createGraphObject { +################################################################ +# +# =head2 _createGraphObject +# +# Title : _createGraphObject +# Usage : my $self->_createGraphObject(); +# automatically called by _createGraph(). +# Function: Gets newly created empty GraphViz instance. +# Returns : newly created empty GraphViz instance. +# +# =cut +# +######################################################################### + + my ($self) = @_; + + my %args; + + if (defined $self->{MAKE_PS} && $self->{MAKE_PS}){ + + %args = (width => 7.5, + height => 10, + pagewidth => 8.5, + pageheight => 11.5); + + } + + $self->{GRAPH} = GraphViz->new(node => { shape => 'box', + style => 'filled' }, + edge => { arrowhead => 'none' }, + overlap => 'false', + name => "GOTermFinder", +# mindist => 20, + %args); + +} + +################################################################ +sub _addNode { +################################################################ +# +# =head2 _addNode +# +# Title : _addNode +# Usage : $self->_addNode($goid); +# automatically called by _createGraph(). +# Function: Adds node to the GraphViz instance. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $goid, %args) = @_; + + my $label; + + my $stdGoid; # moved up in scope -- mark + + if ($goid !~ /^GO:[0-9]+$/) { + + # if the label is not a goid (i.e. it's a list of genes that + # annotate a term), we'll process it, so that there are no + # lines longer than 30 characters + + $label = $self->_processLabel($goid, 30); + + }else{ + + # otherwise, we'll get the term for the goid + + $label = $self->{TERM_HASH_REF_FOR_GOID}->{$goid}; + + if (!$label) { + + # if we didn't get a term, we'll go back to the + # ontologyProvider to get one + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + $label = $node->term; + + } + + # reformat the goid + + if (!$self->{PVALUE_HASH_REF_FOR_GOID}) { + + $stdGoid = $self->_formatGoid($goid); + + }else { + + $stdGoid = $goid; + + } + + # append the goid to the processed label + + if ($label) { + +# See below for why this was changed. -- mark + $label = $self->_processLabel($label); +# $label = $self->_processLabel($label)."\n".$stdGoid; + + }else { + + $label = $stdGoid; + + } + + } + + if ($goid eq $self->_goid) { + # remove the underscore from the root term + $label =~ s/_/ /; + } + +# As seen above, previously the goid was stored in the +# node label, which then appeared in SVG output. +# Rather than storing information in the label, +# we save it in a hash. -- mark +#! this assumes all labels will be unique + my $modlabel = $label; + $modlabel =~ s/\n/\\n/g; + $self->{LABEL_TO_GOID}->{$modlabel} = $stdGoid; + + # now add the node to the graph, with the appropriate label + + # helvetica 15 closely matches gdSmallFont + $self->graph->add_node($goid, + label => $label, + fontsize => 15, + fontname => "helvetica", + %args); + + +} + +################################################################ +sub _addEdge { +################################################################ +# +# =head2 _addEdge +# +# Title : _addEdge +# Usage : $self->_addEdge($parentGoid, $childGoid); +# automatically called by _createGraph(). +# Function: Adds edge to the GraphViz instance. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $parentGoid, $childGoid) = @_; + + $self->graph->add_edge($parentGoid => $childGoid); + +} + +######################################################################## +sub _descendantCount { +######################################################################## +# +# =head2 _descendantCount +# +# Title : _descendantCount +# Usage : my $nodeCount = +# $self->_descendantCount($goid, $generationDown); +# automatically called by _createGraph(). +# Function: Gets total descendant node number down to a given generation. +# Returns : The total descendant node number down to a given generation. +# +# =cut +# +######################################################################### + + my ($self, $goid, $generationDown) = @_; + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + my %descendantCount4generation; + + $self->_descendantCount4generation($node, \%descendantCount4generation); + + my $nodeNum = 0; + + foreach my $generation (1..$generationDown) { + + $nodeNum += $descendantCount4generation{$generation}; + + } + + return $nodeNum; + +} + +######################################################################## +sub _setGeneration { +####################################################################### +# +# =head2 _setGeneration +# +# Title : _setGeneration +# Usage : $self->_setGeneration($goid); +# automatically called by _createGraph(). +# Function: Sets the maximum generation number it will display. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $goid) = @_; + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + my %descendantCount4generation; + + $self->_descendantCount4generation($node, \%descendantCount4generation); + + my $nodeNum = 0; + my $preNodeNum = 0; + + foreach my $generation (sort {$a<=>$b} (keys %descendantCount4generation)) { + + $nodeNum += $descendantCount4generation{$generation}; + + if ($nodeNum == $self->{MAX_NODE}) { + + $self->{GENERATION} = $generation; + + $self->{NODE_NUM} = $nodeNum; + + last; + + } + + if ($nodeNum > $self->{MAX_NODE}) { + + $self->{GENERATION} = $generation-1; + + $self->{NODE_NUM} = $preNodeNum; + + last; + + } + + $preNodeNum = $nodeNum; + + } + + if (!$self->{GENERATION} || $self->{GENERATION} < 1) { + + $self->{GENERATION} = 1; + + if (!$node->childNodes) { + + $self->{TREE_TYPE} = 'up'; + + } + + } + +} + +######################################################################## +sub _descendantCount4generation { +######################################################################## +# +# =head2 _descendantCount4generation +# +# Title : _descendantCount4generation +# Usage : $self->_descendantCount4generation($node, +# $nodeCountHashRef, +# $generation); +# automatically called by _descendantCount(), +# _setGeneration(), and itself. +# Function: Gets the descebdant count for each generation. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self, $node, $nodeCountHashRef, $generation) = @_; + + if (!$generation) { $generation = 1; } + + my @childNodes = $node->childNodes; + + $nodeCountHashRef->{$generation} += scalar(@childNodes); + + foreach my $childNode ($node->childNodes) { + + $self->_descendantCount4generation($childNode, + $nodeCountHashRef, + $generation++); + + } + +} + +################################################################ +sub _setTopGoid { +################################################################ +# +# =head2 _setTopGoid +# +# Title : _setTopGoid +# Usage : $self->_setTopGoid(); +# automatically called by _createGraph(). +# Function: Sets the top ancestor goid. We want to display the +# tree up to this goid. +# Returns : void +# +# =cut +# +######################################################################### + + my ($self) = @_; + + my $node = $self->_ontologyProvider->nodeFromId($self->_goid); + + my $maxGenerationUp = $node->lengthOfLongestPathToRoot - 1; + + my @pathsToRoot = $node->pathsToRoot; + + my %count4goid; + my %generation4goid; + + my $pathNum = scalar(@pathsToRoot); + + foreach my $path (@pathsToRoot) { + + my @nodeInPath = reverse(@{$path}); + + my $generation = 0; + + foreach my $node (@nodeInPath) { + + $count4goid{$node->goid}++; + + $generation++; + + if (!$generation4goid{$node->goid} || + $generation4goid{$node->goid} < $generation) { + + $generation4goid{$node->goid} = $generation; + + } + + if ( $pathNum == $count4goid{$node->goid} ) { + ### the same goid appears on all the paths. + + $self->{TOP_GOID} = $node->goid; + + $self->{GENERATION} = $generation4goid{$node->goid}; + + last; + + } + + } + + } + +} + +################################################################ +sub _initGoidFromOntology { +################################################################ +# +# =head2 _initGoidFromOntology +# +# Title : _initGoidFromOntology +# Usage : my $goid = $self->_initGoidFromOntology; +# automatically called by _init(). +# Function: Gets the top goid for the given ontology +# (biological_process, molecular_function, or +# cellular_component) +# Returns : goid +# +# =cut +# +######################################################################### + + my ($self) = @_; + + # gene_ontology super node + + my $rootNode = $self->_ontologyProvider->rootNode; + + # node for molecular_function, biological_process, or + # cellular_component + + my ($topNode) = $rootNode->childNodes; + + return $topNode->goid; + +} + +####################################################################### +sub _initPvaluesGeneNamesDescendantGoids { +####################################################################### +# +# =head2 _initPvaluesGeneNamesDescendantGoids +# +# Title : _initPvaluesGeneNamesDescendantGoids +# Usage : $self->_initPvaluesGeneNamesDescendantGoids; +# automatically called by _init(). +# Function: Sets $self->{PVALUE_HASH_REF_FOR_GOID}, +# $self->{GENE_NAME_HASH_REF_FOR_GOID}, +# and $self->{DESCENDANT_GOID_ARRAY_REF} +# Returns : void +# +# =cut +# +######################################################################### + +# This method reads through all of the hypotheses that came from the +# TermFinder analysis, and records various information that is +# subsequently used to construct the image. + +# TODO - naming of the $pvalue variable and _termFinder methods is +# very poor, and misleading. This should be changed. +# +# TODO - currently, when we compare the maxTopNodeToShow, we shouldn't +# increment the count when a node that is being added is the parent or +# child of a node already on the graph. + + my ($self) = @_; + + my %foundGoid; + my %foundGoidGene; + + my @directAnnotatedGoid; + my %loci4goid; + my %pvalue4goid; + + # $maxTopNodeToShow is used to limit the size of the output graph. + # The default value (set during initialization is 6, but can be + # modified via a user argument + + # LINDA'S CHANGES +# my $maxTopNodeToShow = $self->{MAX_TOP_NODE_TO_SHOW}; + my $maxChildGoidNum = $self->{MAX_CHILD_GOID_NUM}; + + my $maxTopNodeToShow = $self->{MAX_TOP_NODE_TO_SHOW}; + + my $count = 0; + + # go through each hypothesis + + foreach my $pvalue (@{$self->_termFinder}){ + + # map the goid to the corrected p-value for later retrieval + # when deciding on what color a node should have + + $pvalue4goid{$pvalue->{NODE}->goid} = $pvalue->{CORRECTED_PVALUE}; + + # skip if it has a p-value worse than our threshold note + + next if ($pvalue->{CORRECTED_PVALUE} > $self->{PVALUE_CUTOFF}); + + # skip if we've exceeded the maxTopNodeToShow value + # + # NB, we do 'next', rather than 'last', so that the mapping of the + # goid to p-value still gets recorded for any subsequent nodes, which + # ensures that their colors will be correct + + next if ($count >= $maxTopNodeToShow); + + # grab the GO::Node object for this hypothesis + + my $ancestorNode = $pvalue->{NODE}; + + # now we go through the list of genes directly annotated to + # this node and get every goid to which any of those genes are + # annotated - thus we'll build up a list of all nodes to which + # the genes are annotated. We'll also record which nodes are + # directly annotated by the genes + + foreach my $databaseId (keys %{$pvalue->{ANNOTATED_GENES}}) { + + # get every goid that the gene maps to. + + my $goidArrayRef = $self->_annotationProvider->goIdsByDatabaseId(databaseId => $databaseId, + aspect => $self->{ASPECT}); + + # get the name of the gene, as it was passed to TermFinder + + my $gene = $pvalue->{ANNOTATED_GENES}->{$databaseId}; + + # now go through each goid that the gene was annotated to + + foreach my $goid (@{$goidArrayRef}) { + + # get a GO::Node object for it + + my $node = $self->_ontologyProvider->nodeFromId($goid); + + # the following check is probably superfluous, but + # there may be cases where a node in the annotation + # provider is not in the ontology provider, so we just + # ignore them + + next if (!$node); + + # We only want to keep information about genes + # directly annotated to this node if it is a + # descendant of the '$ancestorNode', which we know + # passes the p-value cut-off, or if it is the node + # itself. In this way, we only record nodes to which + # our genes of interest are annotated if they have + # contributed to the count associated with the + # significant '$ancestorNode', and thus prune the + # tree, as we don't show all nodes to which any of the + # genes are annotated. + + next unless ($node->isADescendantOf($ancestorNode) || $goid eq $ancestorNode->goid); + + # now record some information about the this goid and the gene + + if (!exists $foundGoidGene{$goid."::".$gene}) { + + # record the genes in the list that are annotated + # to this node + + $loci4goid{$goid} .= ":".$gene; + + # and remember that we've recorded this annotation + + $foundGoidGene{$goid."::".$gene}++; + + } + + # skip if we've already seen the goid + + next if ($foundGoid{$goid}); + + # record the goid as directly annotated by a gene of interest + +# push(@directAnnotatedGoid, $goid); + # -- mark (from Linda) + # shouldn't this be if !defined(max) || (# < max)? + if ((defined($maxChildGoidNum)) && (scalar(@directAnnotatedGoid) < $maxChildGoidNum)) { + push(@directAnnotatedGoid, $goid); + } + + # and remember that we've seen it + + $foundGoid{$goid}++; + + } + + } + + $count++; # keep a count of the number of nodes that exceed the cutoff + + } + + # now record all of our information within ourselves + + $self->{DESCENDANT_GOID_ARRAY_REF} = \@directAnnotatedGoid; + $self->{PVALUE_HASH_REF_FOR_GOID} = \%pvalue4goid; + $self->{GENE_NAME_HASH_REF_FOR_GOID} = \%loci4goid; + + # and return the number of top nodes recorded, which is used to + # decide whether to print the graph or not. + + return $count; + +} + +########################################################################## +sub _initVariablesFromConfigFile { +########################################################################## + my ($self, $configFile) = @_; + + my $fh = IO::File->new($configFile, q{<}) || die "Can't open '$configFile' for reading : $!"; + + while(<$fh>) { + + chomp; + + # skip comments, blank, and whitespace only lines + + if (/^\#/ || /^\s*$/) { next; } + + my ($name, $value) = split(/=/); + + $value =~ s/^ *(.+) *$/$1/; + + if ($name =~ /^maxNode/i) { + + $self->{MAX_NODE} = $value; + + }elsif ($name =~ /^maxNodeNameWidth/i) { + + $self->{MAX_NODE_NAME_WIDTH} = $value; + + }elsif ($name =~ /^widthDisplayRatio/i) { + + $self->{WIDTH_DISPLAY_RATIO} = $value; + + }elsif ($name =~ /^heightDisplayRatio/i) { + + $self->{HEIGHT_DISPLAY_RATIO} = $value; + + }elsif ($name =~ /^minMapWidth\b/i) { # need the \b, as it is a substring of minMapWidth4OneLineKey + + $self->{MIN_MAP_WIDTH} = $value; + + }elsif ($name =~ /^minMapHeight4TopKey/i) { + + $self->{MIN_MAP_HEIGHT_FOR_TOP_KEY} = $value; + + }elsif ($name =~ /^minMapWidth4OneLineKey/i) { + + $self->{MIN_MAP_WIDTH_FOR_ONE_LINE_KEY} = $value; + + }elsif ($name =~ /^mapNote/i) { + + $self->{MAP_NOTE} = $value; + + }elsif ($name =~ /^binDir/i) { + + $ENV{PATH} .= ":".$value; + + }elsif ($name =~ /^libDir/i) { + + $ENV{LD_LIBRARY_PATH} .= ":".$value; + + }elsif ($name =~ /^makePs/i){ + + $self->{MAKE_PS} = $value; + + }elsif ($name =~ /^makeSvg/i){ # add SVG support -- mark + + $self->{MAKE_SVG} = $value; + + } + + } + + $fh->close; + +} + +################################################################ +sub _getBoxColor { +################################################################ +# +# =head2 _getBoxColor +# +# Title : _getBoxColor +# Usage : my $boxColor = $self->_getBoxColor($gd, $goid); +# automatically called by _drawNode(). +# Function: Gets the color for the node box in the display. +# Returns : gd color +# +# =cut +# +######################################################################### + + my ($self, $gd, $goid) = @_; + + if ($self->{PVALUE_HASH_REF_FOR_GOID} && + $self->{PVALUE_HASH_REF_FOR_GOID}->{$goid}) { + + return $self->_color4pvalue($gd, + $self->{PVALUE_HASH_REF_FOR_GOID}->{$goid}); + + } + + return $gd->tan; + +} + +################################################################ +sub _color4pvalue { +################################################################ +# +# =head2 _color4pvalue +# +# Title : _color4pvalue +# Usage : my $boxColor = $self->_color4pvalue($gd, $pvalue); +# automatically called by _drawKeys() and _getBoxColor(). +# Function: Gets the color for the node box in the display. +# Returns : gd color +# +# =cut +# +######################################################################### + + my ($self, $gd, $pvalue) = @_; + +#! Different packages have different ideas of what these named +#! colors are. For example, GraphViz and GD do not agree. It +#! would probably be prudent to use RGB values instead of names. +#! -- mark + + if ($pvalue <= 1e-10) { + + return $gd->orange; + + }elsif ($pvalue <= 1e-8) { + + return $gd->yellow; + + }elsif ($pvalue <= 1e-6) { + + return $gd->green4; + + }elsif ($pvalue <= 1e-4) { + + return $gd->lightBlue; + + }elsif ($pvalue <= 1e-2) { + + return $gd->blue4; + + }else { + + return $gd->tan; + + } + +} + +################################################################ +sub _colorForNode{ +################################################################ + + my ($self, $goid) = @_; + + if ($self->{PVALUE_HASH_REF_FOR_GOID} && + $self->{PVALUE_HASH_REF_FOR_GOID}->{$goid}) { + + my $pvalue = $self->{PVALUE_HASH_REF_FOR_GOID}->{$goid}; + + if ($pvalue <= 1e-10) { + + return 'orange'; + + }elsif ($pvalue <= 1e-8) { + + return 'yellow'; + + }elsif ($pvalue <= 1e-6) { + + return 'green'; + + }elsif ($pvalue <= 1e-4) { + + return 'cyan'; + + }elsif ($pvalue <= 1e-2) { + + return 'royalblue1'; + + }else { + + return 'burlywood2'; + + } + + + } + + return 'burlywood2'; + +} + +################################################################ +sub _processLabel { +################################################################ +# +# =head2 _processLabel +# +# Title : _processLabel +# Usage : my $newLabel = $self->_processLabel($label, +# $maxLabelLen); +# automatically called by _drawKeys() and _addNode(). +# Function: Splits the label into multiple lines if the label is too +# long. +# Returns : new label string +# +# =cut +# +######################################################################### + + my ($self, $label, $maxLabelLen) = @_; + + if (!$maxLabelLen) { + + $maxLabelLen = $self->{MAX_NODE_NAME_WIDTH} || 15; + + } + + # separate the label into its constituent words + + my @words = split(/ /, $label); + + my (@lines, $line); + + # go through each word + + foreach my $word (@words) { + + # if the line we're building up is too long already, or it'll + # be too long when we add the next word, add it to the array + # of lines, and start a new one + + if ((defined $line && length($line) >= $maxLabelLen) || + (defined $line && (length($line)+length($word) > $maxLabelLen)) ) { + + # get rid of leading space + + $line =~ s/^ +//; + + push (@lines, $line); + + undef $line; + + } + + # add the current word onto the line + + $line .= " ".$word; + + } + + # add in a final line if there is one + + if (defined $line){ + + $line =~ s/^ +//; + + push (@lines, $line); + + } + + return join("\n", @lines); + +} + +################################################################ +sub _formatGoid { +################################################################ +# +# =head2 _formatGoid +# +# Title : _formatGoid +# Usage : my $goid = $self->_formatGoid($goid); +# automatically called by _init() and _addNode(). +# Function: Reformats the goid (plain number) to STD GOID format +# (GO:0000388) +# Returns : std GOID +# +# =cut +# +######################################################################### + + my ($self, $goid) = @_; + + my $len = length($goid); + + for (my $i = 1; $i <= 7 - $len; $i++) { + + $goid = "0".$goid; + + } + + $goid = "GO:".$goid; + + return $goid; + +} + +####################################################################### +sub DESTROY { +####################################################################### + + # nothing needs to be done + +} + +####################################################################### +1; +####################################################################### +