Upgrade qcrit version, address pylint warnings
timgianitsos committed Dec 18, 2019
1 parent 5b6f902 commit 6db0043
Showing 9 changed files with 225 additions and 153 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -572,4 +572,4 @@ overgeneral-exceptions=BaseException,

[IGNORE WARNINGS]

errors-only=yes
# errors-only=yes
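Note: with errors-only=yes active, pylint reports only error- and fatal-category messages and suppresses the warning, refactor, and convention checks entirely. Commenting the option out restores the full message set, which is why this commit adds per-file disable pragmas below for the checks the project opts out of. A minimal sketch of such a per-file disable, using a hypothetical module rather than one from this repo:

# pylint: disable = missing-module-docstring
# with errors-only off, pylint would otherwise flag this file's
# missing docstring (C0114); the pragma silences just that check here
GREETING = 'hello'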
2 changes: 1 addition & 1 deletion Pipfile
Expand Up @@ -8,7 +8,7 @@ sklearn = "*"
numpy = "*"
scipy = "*"
nltk = "*"
qcrit = "==0.0.9"
qcrit = "==0.0.13"
tqdm = "*"

[dev-packages]
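Note: the pin alone does not change the installed version; the lock file has to be regenerated (for example with pipenv update qcrit, assuming the usual pipenv workflow given the Pipfile here), and that regeneration is what produces the large Pipfile.lock diff below.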
214 changes: 118 additions & 96 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions corpus_categories.py
@@ -1,3 +1,4 @@
# pylint: disable = C
from collections import OrderedDict

#These files are composites of files that already exist in parts in the tesserae corpus
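Note: the bare letter C in the new disable pragma silences pylint's entire convention category (docstrings, naming style, and similar checks) for this file; the other categories, W (warning), E (error), R (refactor), and F (fatal), can be toggled the same way. A minimal sketch with hypothetical code, not from this repo:

# pylint: disable = C
# the file-wide disable above means pylint no longer requires a module
# docstring or flags naming-style issues here; errors are still reported
from collections import OrderedDict
composites = OrderedDict()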
1 change: 1 addition & 0 deletions greek_features.py
@@ -1,3 +1,4 @@
# pylint: disable = missing-function-docstring
'''
Greek features
'''
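Note: missing-function-docstring (C0116 in pylint 2.4+) fires on every function that lacks a docstring; disabling it file-wide keeps the module-level docstring requirement while exempting the many small feature functions. A minimal illustration with a hypothetical feature function:

# pylint: disable = missing-function-docstring
'''Toy module showing the shape of a feature function'''
def vowel_ratio(text):  # no docstring needed once C0116 is disabled
	return sum(ch in 'aeiou' for ch in text) / max(len(text), 1)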
120 changes: 80 additions & 40 deletions ml_analyzers.py
@@ -1,19 +1,31 @@
#TODO display files tested?
# pylint: disable = trailing-whitespace, C0330, unused-argument
'''
Analyzers for a single time period
'''


import warnings
warnings.filterwarnings('ignore') #TODO consider whether to keep
from tqdm import tqdm
import numpy as np
from functools import reduce
import statistics
from collections import Counter
import warnings

from sklearn.exceptions import UndefinedMetricWarning
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn import svm, neural_network, naive_bayes, ensemble, neighbors
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from collections import Counter
from qcrit.color import RED, GREEN, YELLOW, PURPLE, RESET
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from qcrit.model_analyzer import model_analyzer


#Ignores warning for undefined F1-score when a category is never predicted.
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
PURPLE = '\033[95m'
RESET = '\033[0m'

def _display_stats(expected, results, file_names, labels_key, tabs=0):
	assert len(expected) == len(results)

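Note: the rewritten import block narrows the old blanket warnings.filterwarnings('ignore') to a filter scoped to UndefinedMetricWarning, which scikit-learn raises when a metric such as F1 is requested for a class the model never predicted; the RED/GREEN/YELLOW/PURPLE/RESET constants are the standard ANSI terminal escape codes ('\033[91m' switches to bright red, '\033[0m' resets). A minimal sketch of the warning being suppressed, on toy labels rather than repo data:

import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
# class 1 never appears among the predictions, so its F1 is ill-defined;
# with the filter in place it silently falls back to 0.0
print(f1_score([0, 1, 1], [0, 0, 0], average='macro'))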
@@ -51,7 +63,7 @@ def _display_stats(expected, results, file_names, labels_key, tabs=0):
# print(RED + 'Random Forest tests' + RESET)

# features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=0)
# clf = ensemble.RandomForestClassifier(random_state=0)
# clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
# clf.fit(features_train, labels_train)
# results = clf.predict(features_test)
# expected = labels_test
@@ -63,7 +75,7 @@ def _display_stats(expected, results, file_names, labels_key, tabs=0):
@model_analyzer()
def random_forest_cross_validation(data, target, file_names, feature_names, labels_key):
	print(RED + 'Random Forest cross validation' + RESET)
	clf = ensemble.RandomForestClassifier(random_state=0)
	clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
	splitter = StratifiedKFold(n_splits=5, shuffle=False, random_state=0)
	tabs = 1

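Note: n_estimators=10 is now passed explicitly because scikit-learn's default for RandomForestClassifier changed from 10 to 100 in version 0.22, with a FutureWarning in the releases leading up to it; pinning the value keeps results stable across versions. A minimal sketch of the cross-validation setup on synthetic stand-in data (nothing here comes from the repo's corpus):

import numpy as np
from sklearn import ensemble
from sklearn.model_selection import StratifiedKFold, cross_val_score

rng = np.random.RandomState(0)
data = rng.rand(20, 4)           # stand-in feature matrix: 20 texts, 4 features
target = np.array([0, 1] * 10)   # two balanced classes
clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
splitter = StratifiedKFold(n_splits=5)  # each fold keeps the class proportions
print(cross_val_score(clf, data, target, cv=splitter))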
@@ -248,38 +260,66 @@ def random_forest_feature_rankings(data, target, file_names, feature_names, labels_key):
	for t in sorted([(feat, rank) for feat, rank in feature_rankings.items()], key=lambda s: -1 * s[1].mean()):
		print('\t' + '%.6f +/- standard deviation of %.4f' % (t[1].mean(), t[1].std()) + ': ' + t[0])

# @model_analyzer()
# def sample_classifiers(data, target, file_names, feature_names, labels_key):
# #Includes all the machine learning classifiers
# classifiers = [
# ensemble.RandomForestClassifier(random_state=0),
# svm.SVC(gamma=0.00001, kernel='rbf', random_state=0),
# naive_bayes.GaussianNB(priors=None),
# neighbors.KNeighborsClassifier(n_neighbors=5),
# neural_network.MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(12,), random_state=0),
# ]
# features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=5)

# print(RED + 'Miscellaneous machine learning models:' + RESET)
@model_analyzer()
def random_forest_hyper_parameters(data, target, file_names, feature_names, labels_key):
	default_forest_params = {
		'bootstrap': True, 'class_weight': None, 'max_depth': None,
		'max_leaf_nodes': None, 'min_impurity_decrease': 0.0,
		'min_impurity_split': None, 'min_samples_split': 2,
		'min_weight_fraction_leaf': 0.0, 'n_jobs': 1, 'oob_score': False,
		'verbose': 0, 'warm_start': False
	}

# tabs = 1
# for clf in classifiers:
# print('\n' + PURPLE + '\t' * tabs + clf.__class__.__name__ + RESET)
	candidate_params = {
		'max_features': range(int(data.shape[1] ** 0.5), data.shape[1]),
		'n_estimators': (10, 50, 100),
		# 'min_samples_leaf': range(1, int(len(target) ** 0.5)),
		# 'criterion': ('gini', 'entropy'),
	}
	#Best parameters: {'criterion': 'gini', 'max_features': 11, 'min_samples_leaf': 1, 'n_estimators': 100}
	print(f'Testing candidate parameters {candidate_params}')

	best_params = GridSearchCV(
		ensemble.RandomForestClassifier(**default_forest_params), candidate_params,
		verbose=2, cv=3,
	).fit(data, target).best_params_
	print(f'Best parameters: {best_params}')
	clf = ensemble.RandomForestClassifier(**default_forest_params, **best_params)
	print('accuracy from cross-validation = {}'.format(
		statistics.mean(cross_val_score(clf, data, target, scoring='accuracy', cv=5))
	))

@model_analyzer()
def sample_classifiers(data, target, file_names, feature_names, labels_key):
	#Includes a sample of several of the machine learning classifiers
	classifiers = [
		ensemble.RandomForestClassifier(random_state=0, n_estimators=10),
		svm.SVC(gamma=0.00001, kernel='rbf', random_state=0),
		naive_bayes.GaussianNB(priors=None),
		neighbors.KNeighborsClassifier(n_neighbors=5),
		neural_network.MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(12,), random_state=0),
	]
	features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=5)

	print(RED + 'Miscellaneous machine learning models:' + RESET)

# #Parameters used in creating this classifier
# print('\t' * (tabs + 1) + 'Parameters: ' + str(clf.get_params()))
# print()
	tabs = 1
	for clf in classifiers:
		print('\n' + PURPLE + '\t' * tabs + clf.__class__.__name__ + RESET)

# #Train & predict classifier
# clf.fit(features_train, labels_train)
# results = clf.predict(features_test)
# expected = labels_test
		#Parameters used in creating this classifier
		print('\t' * (tabs + 1) + 'Parameters: ' + str(clf.get_params()))
		print()

# _display_stats(expected, results, file_names, labels_key, tabs + 1)
		#Train & predict classifier
		clf.fit(features_train, labels_train)
		results = clf.predict(features_test)
		expected = labels_test

# #Cross validation
# scores = cross_val_score(clf, features_train, labels_train, cv=5)
# print('\t' * (tabs + 1) + YELLOW + 'Cross Validation:' + RESET)
# print('\t' * (tabs + 1) + 'Scores: ' + str(scores))
# print('\t' * (tabs + 1) + 'Avg Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
		_display_stats(expected, results, file_names, labels_key, tabs + 1)

		#Cross validation
		scores = cross_val_score(clf, features_train, labels_train, cv=5)
		print('\t' * (tabs + 1) + YELLOW + 'Cross Validation:' + RESET)
		print('\t' * (tabs + 1) + 'Scores: ' + str(scores))
		print('\t' * (tabs + 1) + 'Avg Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
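Note: the new random_forest_hyper_parameters analyzer leans on GridSearchCV, which fits one forest per parameter combination per fold and exposes the winning combination as best_params_. A self-contained sketch on synthetic data, with the grid abridged from the one in the commit:

import statistics

import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, cross_val_score

rng = np.random.RandomState(0)
data = rng.rand(30, 9)             # stand-in feature matrix: 30 texts, 9 features
target = np.array([0, 1, 2] * 10)  # three balanced classes

candidate_params = {
	'n_estimators': (10, 50, 100),
	'max_features': range(int(data.shape[1] ** 0.5), data.shape[1]),  # sqrt(9)=3 up to 8
}
search = GridSearchCV(
	ensemble.RandomForestClassifier(random_state=0), candidate_params, cv=3,
).fit(data, target)
print('Best parameters:', search.best_params_)

clf = ensemble.RandomForestClassifier(random_state=0, **search.best_params_)
print('accuracy from cross-validation =',
	statistics.mean(cross_val_score(clf, data, target, scoring='accuracy', cv=5)))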
20 changes: 11 additions & 9 deletions run_feature_extraction.py
@@ -1,15 +1,18 @@
import greek_features #seemingly unused here, but this makes the environment recognize features that are decorated
#pylint: disable = C0330
'''Run feature extraction'''
import os
import sys
from corpus_categories import composite_files, genre_to_files
from functools import reduce

from qcrit.color import RED, RESET
import qcrit.extract_features

from download_corpus import download_corpus
from corpus_categories import composite_files, genre_to_files
#seemingly unused here, but this makes the environment recognize features that are decorated
import greek_features #pylint: disable = unused-import

def main():
	'''Main'''
	#Validate command line options
	categories_to_include = set() if len(sys.argv) <= 2 else set(sys.argv[2:])
	if len(sys.argv) > 2 and not all(tok in genre_to_files for tok in categories_to_include):
@@ -20,18 +23,17 @@ def main():

	#Feature extractions
	qcrit.extract_features.main(
		os.path.join(*corpus_path),
		os.path.join(*corpus_path),

		'tess',
		{'tess': qcrit.extract_features.parse_tess},

		#Exclude all files of genres not specified. Exclude composite files no matter what
		excluded_paths=composite_files | (set() if len(sys.argv) <= 2 else
			reduce(lambda cur_set, next_set: cur_set | next_set,
		excluded_paths=composite_files | (set() if len(sys.argv) <= 2 else
			reduce(lambda cur_set, next_set: cur_set | next_set,
				(genre_to_files[tok] for tok in genre_to_files if tok not in categories_to_include), set())),

		output_file=None if len(sys.argv) <= 1 else sys.argv[1]
		output_file=None if len(sys.argv) <= 1 else sys.argv[1]
	)


if __name__ == '__main__':
	main()
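Note: the excluded_paths argument folds the file sets of every genre the user did not request into a single set with reduce, then always unions in the composite files. A minimal sketch of that fold with hypothetical genres and filenames:

from functools import reduce

genre_to_files = {'epic': {'a.tess'}, 'drama': {'b.tess'}, 'lyric': {'c.tess'}}
composite_files = {'combined.tess'}
categories_to_include = {'epic'}

excluded_paths = composite_files | reduce(
	lambda cur_set, next_set: cur_set | next_set,
	(genre_to_files[tok] for tok in genre_to_files if tok not in categories_to_include),
	set(),
)
print(excluded_paths)  # everything except the epic files: b.tess, c.tess, combined.tess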
13 changes: 8 additions & 5 deletions run_ml_analyzers.py
@@ -1,16 +1,19 @@
#pylint: disable = C0330
'''Run Analysis'''
import sys

import ml_analyzers #seemingly unused here, but this makes the environment recognize the model analyzers
#seemingly unused here, but this makes the environment recognize the model analyzers
import ml_analyzers #pylint: disable = unused-import

import qcrit.analyze_models
from qcrit.model_analyzer import DECORATED_ANALYZERS

if __name__ == '__main__':
	qcrit.analyze_models.main(
		sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: '),
		sys.argv[2] if len(sys.argv) > 2 else input('Enter filename to extract classification data: '),
		None if len(sys.argv) > 3 and sys.argv[3] == 'all' else
		sys.argv[3:] if len(sys.argv) > 3 else input('What would you like to do?\n' +
		sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: '),
		sys.argv[2] if len(sys.argv) > 2 else input('Enter filename to extract classification data: '),
		None if len(sys.argv) > 3 and sys.argv[3] == 'all' else
		sys.argv[3:] if len(sys.argv) > 3 else input('What would you like to do?\n' +
			'\n'.join(('\t' + name for name in DECORATED_ANALYZERS)) + '\n'
		).split()
	)
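Note: each argument to qcrit.analyze_models.main falls back to an interactive prompt when it is missing from sys.argv, and the literal 'all' as the third argument passes None through so that every registered analyzer runs. A stripped-down sketch of the fallback idiom, with hypothetical prompts:

import sys

feature_file = sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: ')
analyzers = (None if len(sys.argv) > 2 and sys.argv[2] == 'all'
	else sys.argv[2:] if len(sys.argv) > 2
	else input('Analyzer names: ').split())
print(feature_file, analyzers)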
5 changes: 4 additions & 1 deletion test_grc_features.py
@@ -1,7 +1,10 @@
# pylint: disable = missing-function-docstring, missing-class-docstring
'''Tests'''
import unittest
from greek_features import *
from functools import reduce

from greek_features import * # pylint: disable = wildcard-import, unused-wildcard-import

class TestGreekFeatures(unittest.TestCase):

	def setUp(self):
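Note: as with the unused-import pragmas above, the wildcard import exists for its side effect: importing greek_features executes each feature's decorator, which registers the feature so the tests can see it. A minimal sketch of that registration-by-decoration pattern (a hypothetical registry, not qcrit's actual internals):

REGISTERED = {}

def register(func):
	'''Decorator whose side effect at import time is to record the function.'''
	REGISTERED[func.__name__] = func
	return func

@register
def mean_sentence_length(text):
	'''Toy feature: average words per sentence.'''
	sentences = text.split('.')
	return sum(len(s.split()) for s in sentences) / max(len(sentences), 1)

print(REGISTERED)  # populated merely by importing the defining module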
