Upgrade qcrit version, address pylint warnings
timgianitsos committed Dec 18, 2019
1 parent 5b6f902 commit 6db0043
Showing 9 changed files with 225 additions and 153 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -572,4 +572,4 @@ overgeneral-exceptions=BaseException,

[IGNORE WARNINGS]

errors-only=yes
# errors-only=yes
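Note: with errors-only=yes active, pylint reports only error- and fatal-category messages and suppresses the warning, refactor, and convention checks entirely. Commenting the option out restores the full message set, which is why this commit adds per-file disable pragmas below for the checks the project opts out of. A minimal sketch of such a per-file disable, using a hypothetical module rather than one from this repo:

# pylint: disable = missing-module-docstring
# with errors-only off, pylint would otherwise flag this file's
# missing docstring (C0114); the pragma silences just that check here
GREETING = 'hello'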
2 changes: 1 addition & 1 deletion Pipfile
Expand Up @@ -8,7 +8,7 @@ sklearn = "*"
numpy = "*"
scipy = "*"
nltk = "*"
qcrit = "==0.0.9"
qcrit = "==0.0.13"
tqdm = "*"

[dev-packages]
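Note: the pin alone does not change the installed version; the lock file has to be regenerated (for example with pipenv update qcrit, assuming the usual pipenv workflow given the Pipfile here), and that regeneration is what produces the large Pipfile.lock diff below.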
214 changes: 118 additions & 96 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions corpus_categories.py
@@ -1,3 +1,4 @@
# pylint: disable = C
from collections import OrderedDict

#These files are composites of files that already exist in parts in the tesserae corpus
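Note: the bare letter C in the new disable pragma silences pylint's entire convention category (docstrings, naming style, and similar checks) for this file; the other categories, W (warning), E (error), R (refactor), and F (fatal), can be toggled the same way. A minimal sketch with hypothetical code, not from this repo:

# pylint: disable = C
# the file-wide disable above means pylint no longer requires a module
# docstring or flags naming-style issues here; errors are still reported
from collections import OrderedDict
composites = OrderedDict()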
1 change: 1 addition & 0 deletions greek_features.py
@@ -1,3 +1,4 @@
# pylint: disable = missing-function-docstring
'''
Greek features
'''
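Note: missing-function-docstring (C0116 in pylint 2.4+) fires on every function that lacks a docstring; disabling it file-wide keeps the module-level docstring requirement while exempting the many small feature functions. A minimal illustration with a hypothetical feature function:

# pylint: disable = missing-function-docstring
'''Toy module showing the shape of a feature function'''
def vowel_ratio(text):  # no docstring needed once C0116 is disabled
	return sum(ch in 'aeiou' for ch in text) / max(len(text), 1)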
120 changes: 80 additions & 40 deletions ml_analyzers.py
@@ -1,19 +1,31 @@
#TODO display files tested?
# pylint: disable = trailing-whitespace, C0330, unused-argument
'''
Analyzers for a single time period
'''


import warnings
warnings.filterwarnings('ignore') #TODO consider whether to keep
from tqdm import tqdm
import numpy as np
from functools import reduce
import statistics
from collections import Counter
import warnings

from sklearn.exceptions import UndefinedMetricWarning
import numpy as np
from tqdm import tqdm
import sklearn
from sklearn import svm, neural_network, naive_bayes, ensemble, neighbors
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from collections import Counter
from qcrit.color import RED, GREEN, YELLOW, PURPLE, RESET
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from qcrit.model_analyzer import model_analyzer


#Ignores warning for undefined F1-score when a category is never predicted.
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
PURPLE = '\033[95m'
RESET = '\033[0m'

def _display_stats(expected, results, file_names, labels_key, tabs=0):
	assert len(expected) == len(results)

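Note: the rewritten import block narrows the old blanket warnings.filterwarnings('ignore') to a filter scoped to UndefinedMetricWarning, which scikit-learn raises when a metric such as F1 is requested for a class the model never predicted; the RED/GREEN/YELLOW/PURPLE/RESET constants are the standard ANSI terminal escape codes ('\033[91m' switches to bright red, '\033[0m' resets). A minimal sketch of the warning being suppressed, on toy labels rather than repo data:

import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
# class 1 never appears among the predictions, so its F1 is ill-defined;
# with the filter in place it silently falls back to 0.0
print(f1_score([0, 1, 1], [0, 0, 0], average='macro'))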
@@ -51,7 +63,7 @@ def _display_stats(expected, results, file_names, labels_key, tabs=0):
# print(RED + 'Random Forest tests' + RESET)

# features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=0)
# clf = ensemble.RandomForestClassifier(random_state=0)
# clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
# clf.fit(features_train, labels_train)
# results = clf.predict(features_test)
# expected = labels_test
@@ -63,7 +75,7 @@ def _display_stats(expected, results, file_names, labels_key, tabs=0):
@model_analyzer()
def random_forest_cross_validation(data, target, file_names, feature_names, labels_key):
	print(RED + 'Random Forest cross validation' + RESET)
	clf = ensemble.RandomForestClassifier(random_state=0)
	clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
	splitter = StratifiedKFold(n_splits=5, shuffle=False, random_state=0)
	tabs = 1

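Note: n_estimators=10 is now passed explicitly because scikit-learn's default for RandomForestClassifier changed from 10 to 100 in version 0.22, with a FutureWarning in the releases leading up to it; pinning the value keeps results stable across versions. A minimal sketch of the cross-validation setup on synthetic stand-in data (nothing here comes from the repo's corpus):

import numpy as np
from sklearn import ensemble
from sklearn.model_selection import StratifiedKFold, cross_val_score

rng = np.random.RandomState(0)
data = rng.rand(20, 4)           # stand-in feature matrix: 20 texts, 4 features
target = np.array([0, 1] * 10)   # two balanced classes
clf = ensemble.RandomForestClassifier(random_state=0, n_estimators=10)
splitter = StratifiedKFold(n_splits=5)  # each fold keeps the class proportions
print(cross_val_score(clf, data, target, cv=splitter))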
@@ -248,38 +260,66 @@ def random_forest_feature_rankings(data, target, file_names, feature_names, labels_key):
	for t in sorted([(feat, rank) for feat, rank in feature_rankings.items()], key=lambda s: -1 * s[1].mean()):
		print('\t' + '%.6f +/- standard deviation of %.4f' % (t[1].mean(), t[1].std()) + ': ' + t[0])

# @model_analyzer()
# def sample_classifiers(data, target, file_names, feature_names, labels_key):
# #Includes all the machine learning classifiers
# classifiers = [
# ensemble.RandomForestClassifier(random_state=0),
# svm.SVC(gamma=0.00001, kernel='rbf', random_state=0),
# naive_bayes.GaussianNB(priors=None),
# neighbors.KNeighborsClassifier(n_neighbors=5),
# neural_network.MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(12,), random_state=0),
# ]
# features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=5)

# print(RED + 'Miscellaneous machine learning models:' + RESET)
@model_analyzer()
def random_forest_hyper_parameters(data, target, file_names, feature_names, labels_key):
	default_forest_params = {
		'bootstrap': True, 'class_weight': None, 'max_depth': None,
		'max_leaf_nodes': None, 'min_impurity_decrease': 0.0,
		'min_impurity_split': None, 'min_samples_split': 2,
		'min_weight_fraction_leaf': 0.0, 'n_jobs': 1, 'oob_score': False,
		'verbose': 0, 'warm_start': False
	}

# tabs = 1
# for clf in classifiers:
# print('\n' + PURPLE + '\t' * tabs + clf.__class__.__name__ + RESET)
	candidate_params = {
		'max_features': range(int(data.shape[1] ** 0.5), data.shape[1]),
		'n_estimators': (10, 50, 100),
		# 'min_samples_leaf': range(1, int(len(target) ** 0.5)),
		# 'criterion': ('gini', 'entropy'),
	}
	#Best parameters: {'criterion': 'gini', 'max_features': 11, 'min_samples_leaf': 1, 'n_estimators': 100}
	print(f'Testing candidate parameters {candidate_params}')

	best_params = GridSearchCV(
		ensemble.RandomForestClassifier(**default_forest_params), candidate_params,
		verbose=2, cv=3,
	).fit(data, target).best_params_
	print(f'Best parameters: {best_params}')
	clf = ensemble.RandomForestClassifier(**default_forest_params, **best_params)
	print('accuracy from cross-validation = {}'.format(
		statistics.mean(cross_val_score(clf, data, target, scoring='accuracy', cv=5))
	))

@model_analyzer()
def sample_classifiers(data, target, file_names, feature_names, labels_key):
	#Includes a sample of several of the machine learning classifiers
	classifiers = [
		ensemble.RandomForestClassifier(random_state=0, n_estimators=10),
		svm.SVC(gamma=0.00001, kernel='rbf', random_state=0),
		naive_bayes.GaussianNB(priors=None),
		neighbors.KNeighborsClassifier(n_neighbors=5),
		neural_network.MLPClassifier(activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(12,), random_state=0),
	]
	features_train, features_test, labels_train, labels_test = train_test_split(data, target, test_size=0.4, random_state=5)

	print(RED + 'Miscellaneous machine learning models:' + RESET)

# #Parameters used in creating this classifier
# print('\t' * (tabs + 1) + 'Parameters: ' + str(clf.get_params()))
# print()
	tabs = 1
	for clf in classifiers:
		print('\n' + PURPLE + '\t' * tabs + clf.__class__.__name__ + RESET)

# #Train & predict classifier
# clf.fit(features_train, labels_train)
# results = clf.predict(features_test)
# expected = labels_test
		#Parameters used in creating this classifier
		print('\t' * (tabs + 1) + 'Parameters: ' + str(clf.get_params()))
		print()

# _display_stats(expected, results, file_names, labels_key, tabs + 1)
		#Train & predict classifier
		clf.fit(features_train, labels_train)
		results = clf.predict(features_test)
		expected = labels_test

# #Cross validation
# scores = cross_val_score(clf, features_train, labels_train, cv=5)
# print('\t' * (tabs + 1) + YELLOW + 'Cross Validation:' + RESET)
# print('\t' * (tabs + 1) + 'Scores: ' + str(scores))
# print('\t' * (tabs + 1) + 'Avg Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
		_display_stats(expected, results, file_names, labels_key, tabs + 1)

		#Cross validation
		scores = cross_val_score(clf, features_train, labels_train, cv=5)
		print('\t' * (tabs + 1) + YELLOW + 'Cross Validation:' + RESET)
		print('\t' * (tabs + 1) + 'Scores: ' + str(scores))
		print('\t' * (tabs + 1) + 'Avg Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
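Note: the new random_forest_hyper_parameters analyzer leans on GridSearchCV, which fits one forest per parameter combination per fold and exposes the winning combination as best_params_. A self-contained sketch on synthetic data, with the grid abridged from the one in the commit:

import statistics

import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, cross_val_score

rng = np.random.RandomState(0)
data = rng.rand(30, 9)             # stand-in feature matrix: 30 texts, 9 features
target = np.array([0, 1, 2] * 10)  # three balanced classes

candidate_params = {
	'n_estimators': (10, 50, 100),
	'max_features': range(int(data.shape[1] ** 0.5), data.shape[1]),  # sqrt(9)=3 up to 8
}
search = GridSearchCV(
	ensemble.RandomForestClassifier(random_state=0), candidate_params, cv=3,
).fit(data, target)
print('Best parameters:', search.best_params_)

clf = ensemble.RandomForestClassifier(random_state=0, **search.best_params_)
print('accuracy from cross-validation =',
	statistics.mean(cross_val_score(clf, data, target, scoring='accuracy', cv=5)))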
20 changes: 11 additions & 9 deletions run_feature_extraction.py
@@ -1,15 +1,18 @@
import greek_features #seemingly unused here, but this makes the environment recognize features that are decorated
#pylint: disable = C0330
'''Run feature extraction'''
import os
import sys
from corpus_categories import composite_files, genre_to_files
from functools import reduce

from qcrit.color import RED, RESET
import qcrit.extract_features

from download_corpus import download_corpus
from corpus_categories import composite_files, genre_to_files
#seemingly unused here, but this makes the environment recognize features that are decorated
import greek_features #pylint: disable = unused-import

def main():
	'''Main'''
	#Validate command line options
	categories_to_include = set() if len(sys.argv) <= 2 else set(sys.argv[2:])
	if len(sys.argv) > 2 and not all(tok in genre_to_files for tok in categories_to_include):
@@ -20,18 +23,17 @@ def main():

	#Feature extractions
	qcrit.extract_features.main(
		os.path.join(*corpus_path),
		os.path.join(*corpus_path),

		'tess',
		{'tess': qcrit.extract_features.parse_tess},

		#Exclude all files of genres not specified. Exclude composite files no matter what
		excluded_paths=composite_files | (set() if len(sys.argv) <= 2 else
			reduce(lambda cur_set, next_set: cur_set | next_set,
		excluded_paths=composite_files | (set() if len(sys.argv) <= 2 else
			reduce(lambda cur_set, next_set: cur_set | next_set,
				(genre_to_files[tok] for tok in genre_to_files if tok not in categories_to_include), set())),

		output_file=None if len(sys.argv) <= 1 else sys.argv[1]
		output_file=None if len(sys.argv) <= 1 else sys.argv[1]
	)


if __name__ == '__main__':
	main()
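Note: the excluded_paths argument folds the file sets of every genre the user did not request into a single set with reduce, then always unions in the composite files. A minimal sketch of that fold with hypothetical genres and filenames:

from functools import reduce

genre_to_files = {'epic': {'a.tess'}, 'drama': {'b.tess'}, 'lyric': {'c.tess'}}
composite_files = {'combined.tess'}
categories_to_include = {'epic'}

excluded_paths = composite_files | reduce(
	lambda cur_set, next_set: cur_set | next_set,
	(genre_to_files[tok] for tok in genre_to_files if tok not in categories_to_include),
	set(),
)
print(excluded_paths)  # everything except the epic files: b.tess, c.tess, combined.tess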
13 changes: 8 additions & 5 deletions run_ml_analyzers.py
@@ -1,16 +1,19 @@
#pylint: disable = C0330
'''Run Analysis'''
import sys

import ml_analyzers #seemingly unused here, but this makes the environment recognize the model analyzers
#seemingly unused here, but this makes the environment recognize the model analyzers
import ml_analyzers #pylint: disable = unused-import

import qcrit.analyze_models
from qcrit.model_analyzer import DECORATED_ANALYZERS

if __name__ == '__main__':
	qcrit.analyze_models.main(
		sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: '),
		sys.argv[2] if len(sys.argv) > 2 else input('Enter filename to extract classification data: '),
		None if len(sys.argv) > 3 and sys.argv[3] == 'all' else
		sys.argv[3:] if len(sys.argv) > 3 else input('What would you like to do?\n' +
		sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: '),
		sys.argv[2] if len(sys.argv) > 2 else input('Enter filename to extract classification data: '),
		None if len(sys.argv) > 3 and sys.argv[3] == 'all' else
		sys.argv[3:] if len(sys.argv) > 3 else input('What would you like to do?\n' +
			'\n'.join(('\t' + name for name in DECORATED_ANALYZERS)) + '\n'
		).split()
	)
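Note: each argument to qcrit.analyze_models.main falls back to an interactive prompt when it is missing from sys.argv, and the literal 'all' as the third argument passes None through so that every registered analyzer runs. A stripped-down sketch of the fallback idiom, with hypothetical prompts:

import sys

feature_file = sys.argv[1] if len(sys.argv) > 1 else input('Enter filename to extract feature data: ')
analyzers = (None if len(sys.argv) > 2 and sys.argv[2] == 'all'
	else sys.argv[2:] if len(sys.argv) > 2
	else input('Analyzer names: ').split())
print(feature_file, analyzers)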
5 changes: 4 additions & 1 deletion test_grc_features.py
@@ -1,7 +1,10 @@
# pylint: disable = missing-function-docstring, missing-class-docstring
'''Tests'''
import unittest
from greek_features import *
from functools import reduce

from greek_features import * # pylint: disable = wildcard-import, unused-wildcard-import

class TestGreekFeatures(unittest.TestCase):

	def setUp(self):
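Note: as with the unused-import pragmas above, the wildcard import exists for its side effect: importing greek_features executes each feature's decorator, which registers the feature so the tests can see it. A minimal sketch of that registration-by-decoration pattern (a hypothetical registry, not qcrit's actual internals):

REGISTERED = {}

def register(func):
	'''Decorator whose side effect at import time is to record the function.'''
	REGISTERED[func.__name__] = func
	return func

@register
def mean_sentence_length(text):
	'''Toy feature: average words per sentence.'''
	sentences = text.split('.')
	return sum(len(s.split()) for s in sentences) / max(len(sentences), 1)

print(REGISTERED)  # populated merely by importing the defining module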
