dhmit · wilke0818 · Apr 9, 2021 · Apr 9, 2021 · Apr 16, 2021 · May 14, 2021
diff --git a/gender_analysis/analysis/instance_distance.py b/gender_analysis/analysis/instance_distance.py
@@ -318,8 +318,8 @@ def get_highest_distances(results, num):
     """
     Finds the documents with the largest median distances between each gender that was analyzed.
 
-    Returns a dictionary mapping genders to a list of tuples of the form (``Document``, median),
-    where ``Document``\\ s with higher medians are listed first.
+    Returns a dictionary mapping genders to a list of tuples of the form (median, ``Document``),
+    where ``Document`` objects with higher medians are listed first.
 
     :param results: dictionary of results from ``run_distance_analysis``
     :param num: number of top distances to return
@@ -332,7 +332,7 @@ def get_highest_distances(results, num):
 
     # Get all of the medians for the documents
     for document in results:
-        for gender in results['document']:
+        for gender in results[document]:
             if gender not in medians:
                 medians[gender] = list()
 

diff --git a/gender_analysis/testing/common.py b/gender_analysis/testing/common.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+from gender_analysis.common import BASE_PATH
+
+TEST_DATA_DIR = Path(BASE_PATH, 'testing', 'test_data')
+
+# All tests that use one of the novel corpora point here.
+TEST_CORPUS_PATH = Path(TEST_DATA_DIR, 'sample_novels', 'texts')
+
+# A tiny corpus with only 4 documents
+TINY_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'tiny_test_corpus.csv')
+
+# A small corpus with only 10 documents
+SMALL_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'small_test_corpus.csv')
+
+# A larger corpus with 99 documents
+LARGE_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'large_test_corpus.csv')
+
+# A corpus composed of Reddit posts and comments
+REDDIT_CORPUS_PATH = Path(TEST_DATA_DIR, 'r_starwars_data', 'posts')
+REDDIT_CORPUS_CSV = Path(TEST_DATA_DIR, 'r_starwars_data', 'metadata.csv')
+
+# A directory containing a collection of test documents, and its metadata CSV
+DOCUMENT_TEST_PATH = Path(TEST_DATA_DIR, 'document_test_files')
+DOCUMENT_TEST_CSV = Path(TEST_DATA_DIR, 'document_test_files', 'document_test_files.csv')
diff --git a/gender_analysis/testing/test_data/document_test_files/document_test_files.csv b/gender_analysis/testing/test_data/document_test_files/document_test_files.csv
@@ -0,0 +1,15 @@
+author,date,title,country_publication,author_gender,filename,notes
+,1950,,USA,male,test_text_0.txt,
+,1951,,UK,female,test_text_1.txt,
+,1952,,USA,female,test_text_2.txt,
+,1953,,UK,male,test_text_3.txt,
+,1954,,USA,female,test_text_4.txt,
+,1955,,UK,female,test_text_5.txt,
+,1956,,USA,male,test_text_6.txt,
+,1957,,UK,female,test_text_7.txt,
+,1958,,USA,female,test_text_8.txt,
+,1959,,UK,male,test_text_9.txt,
+,1950,,USA,female,test_text_10.txt,
+,1951,,UK,female,test_text_11.txt,
+,1952,,USA,male,test_text_12.txt,
+,1953,,UK,female,test_text_13.txt,
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_0.txt b/gender_analysis/testing/test_data/document_test_files/test_text_0.txt
@@ -0,0 +1 @@
+"This is a quote" and also "This is my quote"
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_1.txt b/gender_analysis/testing/test_data/document_test_files/test_text_1.txt
@@ -0,0 +1 @@
+?!All-kinds %$< of pun*ct(uatio)n {a}nd sp+ecial cha/rs
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_10.txt b/gender_analysis/testing/test_data/document_test_files/test_text_10.txt
@@ -0,0 +1 @@
+Hester was convicted of adultery was convicted.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_11.txt b/gender_analysis/testing/test_data/document_test_files/test_text_11.txt
@@ -0,0 +1 @@
+She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring. TBH i know nothing about this story.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_12.txt b/gender_analysis/testing/test_data/document_test_files/test_text_12.txt
@@ -0,0 +1 @@
+She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_13.txt b/gender_analysis/testing/test_data/document_test_files/test_text_13.txt
@@ -0,0 +1 @@
+They refuse to permit us to obtain the refuse permit.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_2.txt b/gender_analysis/testing/test_data/document_test_files/test_text_2.txt
@@ -0,0 +1 @@
+Hester was convicted of adultery. which made her very sad, and then Arthur was also sad, and everybody was sad and then Arthur died and it was very sad.  Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_3.txt b/gender_analysis/testing/test_data/document_test_files/test_text_3.txt
@@ -0,0 +1 @@
+Hester was her convicted of adultery. which made her very sad, and then her Arthur was also sad, and her everybody was sad and then Arthur her died and it was very sad. her Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_4.txt b/gender_analysis/testing/test_data/document_test_files/test_text_4.txt
@@ -0,0 +1 @@
+James was his convicted of adultery. which made him very sad, and then his Jane was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_5.txt b/gender_analysis/testing/test_data/document_test_files/test_text_5.txt
@@ -0,0 +1 @@
+James was his convicted of adultery. which made him very sad, and then he Arthur was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_6.txt b/gender_analysis/testing/test_data/document_test_files/test_text_6.txt
@@ -0,0 +1 @@
+Hester was her convicted of adultery. which made her very sad, and then she Hester was also sad, and herself everybody was sad and then she died and it was very sad. Her Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_7.txt b/gender_analysis/testing/test_data/document_test_files/test_text_7.txt
@@ -0,0 +1 @@
+James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_8.txt b/gender_analysis/testing/test_data/document_test_files/test_text_8.txt
@@ -0,0 +1 @@
+James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness.
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_9.txt b/gender_analysis/testing/test_data/document_test_files/test_text_9.txt
@@ -0,0 +1 @@
+Jane was convicted of adultery. she was a beautiful gal, and everyone thought that she was very beautiful, and everybody was sad and then she died. Everyone agreed that she was a beautiful corpse that deserved peace.
diff --git a/gender_analysis/testing/test_data/test_pickle.pgz b/gender_analysis/testing/test_data/test_pickle.pgz
diff --git a/gender_analysis/testing/tests/instance_distance_test.py b/gender_analysis/testing/tests/instance_distance_test.py
@@ -0,0 +1,149 @@
+from pathlib import Path
+import pytest
+from corpus_analysis import Corpus, Document
+from gender_analysis import common
+from gender_analysis.analysis.instance_distance import run_distance_analysis, \
+    get_highest_distances, results_by_location, results_by_author_gender, results_by_year
+from gender_analysis.testing import common as test_common
+
+documents = []
+corpus = Corpus(test_common.DOCUMENT_TEST_PATH, csv_path=test_common.DOCUMENT_TEST_CSV)
+for i in range(14):
+    name = "test_text_" + str(i) + ".txt"
+    location = "USA" if i % 2 == 0 else "UK"
+    author_gender = "Male" if i % 3 == 0 else "Female"
+    date = i % 10 + 1950
+    documents.append(Document({"filename": name, "country_publication": location, "date": str(date),
+                               "author_gender": author_gender,
+                     "filepath": Path(test_common.DOCUMENT_TEST_PATH, name)}))
+
+
+class TestInstanceDistance:
+
+    def test_simple_run_distance_analysis_male(self):
+        results = run_distance_analysis(corpus, {common.MALE})
+
+        expected = dict()
+
+        expected[documents[1]] = {}
+        expected[documents[1]][common.MALE] = {"mean": 0, "median": 0, "min": 0, "max": 0}
+
+        expected[documents[4]] = {}
+        expected[documents[4]][common.MALE] = {"mean": 6, "median": 6, "min": 5, "max": 7}
+
+        expected[documents[3]] = {}
+        expected[documents[3]][common.MALE] = {"mean": 0, "median": 0, "min": 0, "max": 0}
+
+        assert expected[documents[1]] == results[documents[1]]
+        assert expected[documents[3]] == results[documents[3]]
+        assert expected[documents[4]] == results[documents[4]]
+
+    def test_simple_run_distance_analysis_female(self):
+        results = run_distance_analysis(corpus, {common.FEMALE})
+
+        expected = dict()
+
+        expected[documents[1]] = {}
+        expected[documents[1]][common.FEMALE] = {"mean": 0, "median": 0, "min": 0, "max": 0}
+
+        expected[documents[4]] = {}
+        expected[documents[4]][common.FEMALE] = {"mean": 0, "median": 0, "min": 0, "max": 0}
+
+        expected[documents[3]] = {}
+        expected[documents[3]][common.FEMALE] = {"mean": 6.2, "median": 6, "min": 5, "max": 7}
+
+        assert expected[documents[1]] == results[documents[1]]
+        assert expected[documents[3]] == results[documents[3]]
+        assert expected[documents[4]] == results[documents[4]]
+
+    def test_simple_run_distance_analysis_unspecified(self):
+        results = run_distance_analysis(corpus)
+
+        expected = dict()
+
+        expected[documents[11]] = {}
+        expected[documents[11]][common.FEMALE] = {"mean": 17.5, "median": 17.5, "min": 6, "max": 29}
+        expected[documents[11]][common.MALE] = {"mean": 5, "median": 2, "min": 1, "max": 12}
+
+        assert expected[documents[11]] == results[documents[11]]
+
+    def test_get_highest_distances(self):
+        results = run_distance_analysis(corpus)
+        results = get_highest_distances(results, 14)
+        print(results)
+        i = 0
+        while i < len(results[common.MALE]):
+            if results[common.MALE][i][1] not in (documents[4], documents[3], documents[11]):
+                results[common.MALE].remove(results[common.MALE][i])
+            else:
+                i += 1
+
+        i = 0
+        while i < len(results[common.FEMALE]):
+            if results[common.FEMALE][i][1] not in (documents[4], documents[3], documents[11]):
+                results[common.FEMALE].remove(results[common.FEMALE][i])
+            else:
+                i += 1
+
+        assert results == {common.MALE: [(6, documents[4]), (2, documents[11]), (0, documents[3])],
+                           common.FEMALE: [(17.5, documents[11]), (6, documents[3]),
+                                           (0, documents[4])]}
+
+    def test_results_by_location(self):
+        results = run_distance_analysis(corpus)
+
+        results = results_by_location(results, "median")
+
+        expected = dict()
+        expected["USA"] = {}
+        expected["UK"] = {}
+        expected["USA"][documents[4]] = {common.MALE: 6, common.FEMALE: 0}
+        expected["UK"][documents[11]] = {common.MALE: 2, common.FEMALE: 17.5}
+        expected["UK"][documents[3]] = {common.MALE: 0, common.FEMALE: 6}
+
+
+        assert results["UK"][documents[11]] == expected["UK"][documents[11]]
+        assert results["UK"][documents[3]] == expected["UK"][documents[3]]
+        assert results["USA"][documents[4]] == expected["USA"][documents[4]]
+
+    def test_results_by_author_gender(self):
+        results = run_distance_analysis(corpus)
+        results = results_by_author_gender(results, "median")
+
+        expected = dict()
+        expected["male"] = {}
+        expected["female"] = {}
+        expected["female"][documents[4]] = {common.MALE: 6, common.FEMALE: 0}
+        expected["female"][documents[11]] = {common.MALE: 2, common.FEMALE: 17.5}
+        expected["male"][documents[3]] = {common.MALE: 0, common.FEMALE: 6}
+
+
+        assert results["female"][documents[11]] == expected["female"][documents[11]]
+        assert results["male"][documents[3]] == expected["male"][documents[3]]
+        assert results["female"][documents[4]] == expected["female"][documents[4]]
+
+    def test_results_by_date(self):
+        results = run_distance_analysis(corpus)
+
+        results = results_by_year(results, "median", (1950, 1962), 1)
+        results[1953].pop(documents[13])
+
+        expected = dict()
+        expected[1953] = {}
+        expected[1954] = {}
+        expected[1954][documents[4]] = {common.MALE: 6, common.FEMALE: 0}
+        expected[1953][documents[3]] = {common.MALE: 0, common.FEMALE: 6}
+
+        assert results[1953] == expected[1953]
+        assert results[1954] == expected[1954]
+
+    def test_results_error_throwing(self):
+        results = run_distance_analysis(corpus)
+        with pytest.raises(ValueError):
+            results2 = results_by_year(results, "mode", (1950, 1962), 1)
+
+        with pytest.raises(ValueError):
+            results2 = results_by_location(results, "mode")
+
+        with pytest.raises(ValueError):
+            results2 = results_by_author_gender(results, "mode")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		?!All-kinds %$< of pun*ct(uatio)n {a}nd sp+ecial cha/rs
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Hester was convicted of adultery was convicted.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring. TBH i know nothing about this story.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		They refuse to permit us to obtain the refuse permit.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Hester was convicted of adultery. which made her very sad, and then Arthur was also sad, and everybody was sad and then Arthur died and it was very sad. Sadness.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Hester was her convicted of adultery. which made her very sad, and then her Arthur was also sad, and her everybody was sad and then Arthur her died and it was very sad. her Sadness.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		James was his convicted of adultery. which made him very sad, and then his Jane was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Jane was convicted of adultery. she was a beautiful gal, and everyone thought that she was very beautiful, and everybody was sad and then she died. Everyone agreed that she was a beautiful corpse that deserved peace.