From 55cf1f1c89095d1beaae4416e5d499081568b662 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 10:26:44 +0200 Subject: [PATCH 01/13] fix: bump dill for language loading to avoid AttributeError --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c39604a1..e5ac8069 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ scikit-learn>=0.23.0 pandas>=1.0.0 torch<1.9>=1.4.0 diskcache>=5.0.3 -dill>=0.3.0 +dill>=0.3.3 selfies>=2.0.0 upfp>=0.0.5 SmilesPE>=0.0.3 diff --git a/setup.py b/setup.py index 6bbaaf4d..f40d7830 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ def get_version(rel_path): 'pandas', 'torch>=1.0.0', 'diskcache', - 'dill', + 'dill>=0.3.3', 'selfies>=2.0.0', 'upfp', 'SmilesPE>=0.0.3', From d2e96c2d3c6004e1d906ca295fae7f2d538ad3bb Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 10:36:47 +0200 Subject: [PATCH 02/13] fix: enable python<3.7 compatibility (fixes #152) --- pytoda/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytoda/types.py b/pytoda/types.py index f4b6e897..0415cd7f 100644 --- a/pytoda/types.py +++ b/pytoda/types.py @@ -1,5 +1,6 @@ """Type definitions.""" import inspect +from collections import OrderedDict # noqa from typing import ( # noqa Any, Callable, @@ -10,7 +11,6 @@ List, Mapping, Optional, - OrderedDict, Sequence, Tuple, Union, From c3e2e8c9c7c11910cfba427873fe215f2e89daf9 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 21:57:08 +0200 Subject: [PATCH 03/13] fix: 3.6 compatible numpy requirements and pin pytorch version --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index e5ac8069..ad6a1803 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -numpy>=1.20.0 +numpy>=1.19.0 scikit-learn>=0.23.0 pandas>=1.0.0 -torch<1.9>=1.4.0 +torch>=1.4.0,<1.9 diskcache>=5.0.3 dill>=0.3.3 selfies>=2.0.0 diff --git a/setup.py b/setup.py index f40d7830..1c3fd32f 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def get_version(rel_path): 'numpy', 'scikit-learn', 'pandas', - 'torch>=1.0.0', + 'torch>=1.4.0', 'diskcache', 'dill>=0.3.3', 'selfies>=2.0.0', From 1c3cbcdedff596520246687066ecd0177e183484 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 21:59:41 +0200 Subject: [PATCH 04/13] fix: proteinsequencedataset setup (fixes #151) --- pytoda/__init__.py | 2 +- pytoda/datasets/protein_sequence_dataset.py | 33 ++++++++++----------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pytoda/__init__.py b/pytoda/__init__.py index 6c7a7451..2bdfb4ea 100644 --- a/pytoda/__init__.py +++ b/pytoda/__init__.py @@ -1,2 +1,2 @@ name = 'pytoda' -__version__ = '1.0.0' +__version__ = '1.0.1' diff --git a/pytoda/datasets/protein_sequence_dataset.py b/pytoda/datasets/protein_sequence_dataset.py index c7e34a45..ff851d9e 100644 --- a/pytoda/datasets/protein_sequence_dataset.py +++ b/pytoda/datasets/protein_sequence_dataset.py @@ -155,10 +155,9 @@ def __init__( Defaults to False. device (torch.device): device where the tensors are stored. Defaults to gpu, if available. - iterate_dataset (bool): whether to go through all items in the - dataset to extend/build vocab, find longest sequence, and - checks the passed padding length if applicable. Defaults to - False. + iterate_dataset (bool): whether to go through all items in the dataset + to detect unknown characters, find longest sequence and checks + passed padding length if applicable. Defaults to False. backend (str): memory management backend. Defaults to eager, prefer speed over memory consumption. name (str): name of the ProteinSequenceDataset. @@ -195,7 +194,7 @@ def __init__( ), f'add_start_and_stop was "{add_start_and_stop}", but given ' f'Protein Language has {protein_language.add_start_and_stop}.' - if iterate_dataset: + if iterate_dataset or not protein_language: tokens = set(self.protein_language.token_to_index.keys()) for sequence in self.dataset: # sets max_token_sequence_length @@ -208,34 +207,32 @@ def __init__( # Set up transformation paramater self.padding = padding - self.padding_length = self.padding_length = ( + self.padding_length = ( self.protein_language.max_token_sequence_length if padding_length is None else padding_length ) + if self.padding_length < self.protein_language.max_token_sequence_length: + logger.warning( + f'WARNING: Passed padding length was {padding_length} but ' + 'protein language has padding length: ' + f'{self.protein_language.max_token_sequence_length}. ' + 'Some sequences might get truncated.' + ) self.randomize = randomize self.augment_by_revert = augment_by_revert self.device = device # Build up cascade of Protein transformations - # Below transformations are optional - _transforms = [] + transforms = [] if self.augment_by_revert: - _transforms += [AugmentByReversing()] - self.language_transforms = Compose(_transforms) + transforms += [AugmentByReversing()] + self.language_transforms = Compose(transforms.copy()) - # Run once over dataset to add missing tokens to smiles language - for index in range(len(self.dataset)): - self.protein_language.add_sequence( - self.language_transforms(self.dataset[index]) - ) - transforms = _transforms.copy() transforms += [SequenceToTokenIndexes(protein_language=self.protein_language)] if self.randomize: transforms += [Randomize()] if self.padding: - if padding_length is None: - self.padding_length = self.protein_language.max_token_sequence_length transforms += [ LeftPadding( padding_length=self.padding_length, From 0e626010e3c6c3d4d33be03c4ef30f845c93f385 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:18:23 +0200 Subject: [PATCH 05/13] wip: ensure that protein language is not empty --- pytoda/datasets/protein_protein_interaction_dataset.py | 8 ++++++++ pytoda/datasets/protein_sequence_dataset.py | 8 ++++++++ .../tests/test_protein_protein_interaction_dataset.py | 4 ++++ 3 files changed, 20 insertions(+) diff --git a/pytoda/datasets/protein_protein_interaction_dataset.py b/pytoda/datasets/protein_protein_interaction_dataset.py index 643a7b6e..8fc8255f 100644 --- a/pytoda/datasets/protein_protein_interaction_dataset.py +++ b/pytoda/datasets/protein_protein_interaction_dataset.py @@ -30,6 +30,7 @@ def __init__( add_start_and_stops: Union[bool, Sequence[bool]] = False, augment_by_reverts: Union[bool, Sequence[bool]] = False, randomizes: Union[bool, Sequence[bool]] = False, + iterate_datasets: Union[bool, Sequence[bool]] = False, device: torch.device = ( torch.device('cuda' if torch.cuda.is_available() else 'cpu') ), @@ -77,6 +78,10 @@ def __init__( stochastic reversion of the amino acid sequence. randomizes (Union[bool, Sequence[bool]]): perform a true randomization of the amino acid sequences. Defaults to False. + iterate_datasets (Union[bool, Sequence[bool]]): whether to go through all + items in the datasets to detect unknown characters, find longest + sequence and checks passed padding length if applicable. + Defaults to False. device (torch.device): device where the tensors are stored. Defaults to gpu, if available. """ @@ -116,6 +121,7 @@ def __init__( self.add_start_and_stops, self.augment_by_reverts, self.randomizes, + self.iterate_datasets, ) = map( ( lambda x: x @@ -128,6 +134,7 @@ def __init__( add_start_and_stops, augment_by_reverts, randomizes, + iterate_datasets, ), ) @@ -171,6 +178,7 @@ def __init__( randomize=self.randomizes[index], device=self.device, name=self.entities[index], + iterate_dataset=self.iterate_datasets[index], ) for index, filepaths in enumerate(self.sequence_filepaths) ] diff --git a/pytoda/datasets/protein_sequence_dataset.py b/pytoda/datasets/protein_sequence_dataset.py index ff851d9e..0e3b4b11 100644 --- a/pytoda/datasets/protein_sequence_dataset.py +++ b/pytoda/datasets/protein_sequence_dataset.py @@ -204,6 +204,14 @@ def __init__( logger.error( 'Found unknown token(s): %s', list(seq_tokens - tokens) ) + elif ( + not iterate_dataset + and protein_language + and protein_language.max_token_sequence_length < 3 + ): + raise ValueError( + 'If provided ProteinLanguage is empty, set iterate_dataset to True' + ) # Set up transformation paramater self.padding = padding diff --git a/pytoda/datasets/tests/test_protein_protein_interaction_dataset.py b/pytoda/datasets/tests/test_protein_protein_interaction_dataset.py index 8dd9441f..61b471a1 100644 --- a/pytoda/datasets/tests/test_protein_protein_interaction_dataset.py +++ b/pytoda/datasets/tests/test_protein_protein_interaction_dataset.py @@ -314,6 +314,7 @@ def test___getitem__(self) -> None: sequence_filetypes='.smi', annotations_column_names=[1], add_start_and_stops=s, + iterate_datasets=True, ) self.assertEqual(len(ppi_dataset), 2) @@ -345,6 +346,7 @@ def test___getitem__(self) -> None: sequence_filetypes='.smi', annotations_column_names=[1], add_start_and_stops=s, + iterate_datasets=True, ) self.assertEqual(len(ppi_dataset), 2) @@ -376,6 +378,7 @@ def test___getitem__(self) -> None: sequence_filetypes='.smi', annotations_column_names=[1], add_start_and_stops=True, + iterate_datasets=True, ) self.assertEqual(len(ppi_dataset), 2) @@ -402,6 +405,7 @@ def test___getitem__(self) -> None: sequence_filetypes='.smi', annotations_column_names=[1], add_start_and_stops=True, + iterate_datasets=True, ) self.assertEqual(len(ppi_dataset), 2) From 835aea293065c36cef83280bb9400953f0ee9280 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:22:57 +0200 Subject: [PATCH 06/13] ci: bump black due to click error --- dev_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev_requirements.txt b/dev_requirements.txt index b779d3c7..1e66ae5e 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -2,7 +2,7 @@ flake8==3.8.4 mypy==0.800 pytest==6.1.1 pytest-cov==2.10.1 -black==20.8b1 +black>=22.3.0 sphinx==3.4.3 sphinx-autodoc-typehints==1.11.1 better-apidoc==0.3.1 From 9c704c45a0f10bf2c3a810fbf7384f1a24de160c Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:25:48 +0200 Subject: [PATCH 07/13] ci: bump black due to click error --- .github/workflows/style.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index cad7d719..1e863bd6 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -52,7 +52,7 @@ jobs: python-version: "3.7" - name: Install black - run: pip install black==20.8b1 + run: pip install black==22.3.0 - name: Format run: black . --check --diff --color From bb8ebedc1165f9dcb0a7757fa0049a06156b3762 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:28:09 +0200 Subject: [PATCH 08/13] chore: blackening --- pytoda/datasets/_csv_statistics.py | 4 ++-- pytoda/datasets/smiles_dataset.py | 4 ++-- pytoda/smiles/transforms.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytoda/datasets/_csv_statistics.py b/pytoda/datasets/_csv_statistics.py index d7affa22..c955ea94 100644 --- a/pytoda/datasets/_csv_statistics.py +++ b/pytoda/datasets/_csv_statistics.py @@ -235,7 +235,7 @@ def reduce_csv_statistics( std = np.sqrt( np.nansum( [ - (dataset_std ** 2 + dataset_mean ** 2) * number_of_samples + (dataset_std**2 + dataset_mean**2) * number_of_samples for dataset_std, dataset_mean, number_of_samples in zip( stds, means, sample_numbers ) @@ -243,6 +243,6 @@ def reduce_csv_statistics( axis=0, ) / total_number_of_samples - - mean ** 2 + - mean**2 ) return (features, maximum, minimum, mean, std) diff --git a/pytoda/datasets/smiles_dataset.py b/pytoda/datasets/smiles_dataset.py index f7f52944..bef62072 100644 --- a/pytoda/datasets/smiles_dataset.py +++ b/pytoda/datasets/smiles_dataset.py @@ -20,7 +20,7 @@ class SMILESDataset(DatasetDelegator): - """ Dataset of SMILES. """ + """Dataset of SMILES.""" def __init__( self, @@ -58,7 +58,7 @@ def __init__( class SMILESTokenizerDataset(DatasetDelegator): - """ Dataset of token indexes from SMILES. """ + """Dataset of token indexes from SMILES.""" def __init__( self, diff --git a/pytoda/smiles/transforms.py b/pytoda/smiles/transforms.py index fa67d3c1..28bed3af 100644 --- a/pytoda/smiles/transforms.py +++ b/pytoda/smiles/transforms.py @@ -178,7 +178,7 @@ def __call__(self, smiles: str) -> Union[Indexes, Tensor]: class RemoveIsomery(Transform): - """ Remove isomery (isotopic and chiral specifications) from SMILES """ + """Remove isomery (isotopic and chiral specifications) from SMILES""" def __init__(self, bonddir=True, chirality=True, sanitize=True) -> None: """ @@ -313,7 +313,7 @@ def __call__(self, smiles: str) -> str: class NotKekulize(Transform): - """ Transform SMILES without explicitly converting to Kekule version """ + """Transform SMILES without explicitly converting to Kekule version""" def __init__(self, all_bonds_explicit=False, all_hs_explicit=False, sanitize=True): self.all_bonds_explicit = all_bonds_explicit @@ -357,7 +357,7 @@ def __init__( sanitize: bool = True, seed: int = -1, ) -> None: - """ NOTE: These parameter need to be passed down to the enumerator.""" + """NOTE: These parameter need to be passed down to the enumerator.""" self.kekule_smiles = kekule_smiles self.all_bonds_explicit = all_bonds_explicit @@ -534,7 +534,7 @@ def __call__tensor(self, smiles_numerical: Tensor) -> torch.Tensor: class Selfies(Transform): - """ Convert a molecule from SMILES to SELFIES. """ + """Convert a molecule from SMILES to SELFIES.""" def __call__(self, smiles: str) -> str: return selfies_encoder(smiles) From 2a35c025178ed2ac15b3b2d9185a7937e58159e4 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:35:41 +0200 Subject: [PATCH 09/13] doc: remove code quality checks [skip ci] --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index b87abb86..17addef2 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,6 @@ [![PyPI version](https://badge.fury.io/py/pytoda.svg)](https://badge.fury.io/py/pytoda) [![build](https://github.com/PaccMann/paccmann_datasets/workflows/build/badge.svg)](https://github.com/PaccMann/paccmann_datasets/actions) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Code quality](https://api.codiga.io/project/22043/score/svg)](https://codiga.io/) -[![Code grade](https://api.codiga.io/project/22043/status/svg)](https://codiga.io/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Downloads](https://pepy.tech/badge/pytoda)](https://pepy.tech/project/pytoda) [![Downloads](https://pepy.tech/badge/pytoda/month)](https://pepy.tech/project/pytoda) From e1382383860be755cba45c504720852dcd7206b0 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 22:58:08 +0200 Subject: [PATCH 10/13] ci: trigger CI --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 17addef2..41a4da75 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# pytoda +# PyToDa [![PyPI version](https://badge.fury.io/py/pytoda.svg)](https://badge.fury.io/py/pytoda) [![build](https://github.com/PaccMann/paccmann_datasets/workflows/build/badge.svg)](https://github.com/PaccMann/paccmann_datasets/actions) From 50f3ce17739d9d8053f91af1216b63a914f43dca Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Fri, 15 Apr 2022 23:00:35 +0200 Subject: [PATCH 11/13] ci: update black --- .github/workflows/suggest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/suggest.yml b/.github/workflows/suggest.yml index 8132e244..e847938e 100644 --- a/.github/workflows/suggest.yml +++ b/.github/workflows/suggest.yml @@ -15,7 +15,7 @@ jobs: python-version: "3.7" - name: Install black - run: pip install black==20.8b1 + run: pip install black==22.3.0 - name: Format run: black . From 74271a448b0069ab534e4ea165782ab3bea4ee9c Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 16 Apr 2022 11:51:09 +0200 Subject: [PATCH 12/13] fix: resolve bug with python version in language loading via dill --- pytoda/__init__.py | 2 +- pytoda/proteins/protein_language.py | 13 +++++++++++-- pytoda/smiles/smiles_language.py | 13 +++++++++++-- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/pytoda/__init__.py b/pytoda/__init__.py index 2bdfb4ea..8b7242d5 100644 --- a/pytoda/__init__.py +++ b/pytoda/__init__.py @@ -1,2 +1,2 @@ name = 'pytoda' -__version__ = '1.0.1' +__version__ = '1.0.2' diff --git a/pytoda/proteins/protein_language.py b/pytoda/proteins/protein_language.py index 3d800fd7..f62dd9ff 100644 --- a/pytoda/proteins/protein_language.py +++ b/pytoda/proteins/protein_language.py @@ -107,8 +107,17 @@ def load(filepath: str) -> 'ProteinLanguage': Returns: ProteinLanguage: the loaded Protein language object. """ - with open(filepath, 'rb') as f: - protein_language = dill.load(f) + try: + with open(filepath, 'rb') as f: + protein_language = dill.load(f) + except TypeError: + # Necessary to load python3.7 pickled objects with >=3.8 + # For details see: https://github.com/uqfoundation/dill/pull/406 + storage = dill._dill._reverse_typemap['CodeType'] + dill._dill._reverse_typemap['CodeType'] = dill._dill._create_code + with open(filepath, 'rb') as f: + protein_language = dill.load(f) + dill._dill._reverse_typemap['CodeType'] = storage return protein_language @staticmethod diff --git a/pytoda/smiles/smiles_language.py b/pytoda/smiles/smiles_language.py index 03ca3b13..628477d8 100644 --- a/pytoda/smiles/smiles_language.py +++ b/pytoda/smiles/smiles_language.py @@ -176,8 +176,17 @@ def load(filepath: str) -> 'SMILESLanguage': warnings.warn( "Loading languages will use a text files in the future", FutureWarning ) - with open(filepath, 'rb') as f: - smiles_language = dill.load(f) + try: + with open(filepath, 'rb') as f: + smiles_language = dill.load(f) + except TypeError: + # Necessary to load python3.7 pickled objects with >=3.8: + # For details see: https://github.com/uqfoundation/dill/pull/406 + storage = dill._dill._reverse_typemap['CodeType'] + dill._dill._reverse_typemap['CodeType'] = dill._dill._create_code + with open(filepath, 'rb') as f: + smiles_language = dill.load(f) + dill._dill._reverse_typemap['CodeType'] = storage return smiles_language @staticmethod From 43e2ad25fba095b0fe82f2158002b73654492058 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 16 Apr 2022 12:01:18 +0200 Subject: [PATCH 13/13] ci: add newline --- pytoda/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytoda/__init__.py b/pytoda/__init__.py index ef551dc2..8b7242d5 100644 --- a/pytoda/__init__.py +++ b/pytoda/__init__.py @@ -1,2 +1,2 @@ name = 'pytoda' -__version__ = '1.0.2' \ No newline at end of file +__version__ = '1.0.2'