diff --git a/gender_analysis/analysis/instance_distance.py b/gender_analysis/analysis/instance_distance.py index f2a4b4e1..32671da5 100644 --- a/gender_analysis/analysis/instance_distance.py +++ b/gender_analysis/analysis/instance_distance.py @@ -318,8 +318,8 @@ def get_highest_distances(results, num): """ Finds the documents with the largest median distances between each gender that was analyzed. - Returns a dictionary mapping genders to a list of tuples of the form (``Document``, median), - where ``Document``\\ s with higher medians are listed first. + Returns a dictionary mapping genders to a list of tuples of the form (median, ``Document``), + where `Document`s with higher medians are listed first. :param results: dictionary of results from ``run_distance_analysis`` :param num: number of top distances to return @@ -332,7 +332,7 @@ def get_highest_distances(results, num): # Get all of the medians for the documents for document in results: - for gender in results['document']: + for gender in results[document]: if gender not in medians: medians[gender] = list() diff --git a/gender_analysis/testing/common.py b/gender_analysis/testing/common.py new file mode 100644 index 00000000..3c6f2718 --- /dev/null +++ b/gender_analysis/testing/common.py @@ -0,0 +1,24 @@ +from pathlib import Path +from gender_analysis.common import BASE_PATH + +TEST_DATA_DIR = Path(BASE_PATH, 'testing', 'test_data') + +# All tests using one of the novel corpora point here. 
+TEST_CORPUS_PATH = Path(TEST_DATA_DIR, 'sample_novels', 'texts') + +# A tiny corpus with only 4 documents +TINY_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'tiny_test_corpus.csv') + +# A small corpus with only 10 documents +SMALL_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'small_test_corpus.csv') + +# A larger corpus with 99 documents +LARGE_TEST_CORPUS_CSV = Path(TEST_DATA_DIR, 'sample_novels', 'large_test_corpus.csv') + +# A corpus that is comprised of Reddit posts and comments +REDDIT_CORPUS_PATH = Path(TEST_DATA_DIR, 'r_starwars_data', 'posts') +REDDIT_CORPUS_CSV = Path(TEST_DATA_DIR, 'r_starwars_data', 'metadata.csv') + +# A directory to a collection of test documents +DOCUMENT_TEST_PATH = Path(TEST_DATA_DIR, 'document_test_files') +DOCUMENT_TEST_CSV = Path(TEST_DATA_DIR, 'document_test_files', 'document_test_files.csv') diff --git a/gender_analysis/testing/test_data/document_test_files/document_test_files.csv b/gender_analysis/testing/test_data/document_test_files/document_test_files.csv new file mode 100644 index 00000000..42858b96 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/document_test_files.csv @@ -0,0 +1,15 @@ +author,date,title,country_publication,author_gender,filename,notes +,1950,,USA,male,test_text_0.txt, +,1951,,UK,female,test_text_1.txt, +,1952,,USA,female,test_text_2.txt, +,1953,,UK,male,test_text_3.txt, +,1954,,USA,female,test_text_4.txt, +,1955,,UK,female,test_text_5.txt, +,1956,,USA,male,test_text_6.txt, +,1957,,UK,female,test_text_7.txt, +,1958,,USA,female,test_text_8.txt, +,1959,,UK,male,test_text_9.txt, +,1950,,USA,female,test_text_10.txt, +,1951,,UK,female,test_text_11.txt, +,1952,,USA,male,test_text_12.txt, +,1953,,UK,female,test_text_13.txt, diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_0.txt b/gender_analysis/testing/test_data/document_test_files/test_text_0.txt new file mode 100644 index 00000000..684cbdbd --- /dev/null +++ 
b/gender_analysis/testing/test_data/document_test_files/test_text_0.txt @@ -0,0 +1 @@ +"This is a quote" and also "This is my quote" diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_1.txt b/gender_analysis/testing/test_data/document_test_files/test_text_1.txt new file mode 100644 index 00000000..8f35d007 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_1.txt @@ -0,0 +1 @@ +?!All-kinds %$< of pun*ct(uatio)n {a}nd sp+ecial cha/rs diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_10.txt b/gender_analysis/testing/test_data/document_test_files/test_text_10.txt new file mode 100644 index 00000000..7bac1358 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_10.txt @@ -0,0 +1 @@ +Hester was convicted of adultery was convicted. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_11.txt b/gender_analysis/testing/test_data/document_test_files/test_text_11.txt new file mode 100644 index 00000000..2016a2b2 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_11.txt @@ -0,0 +1 @@ +She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring. TBH i know nothing about this story. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_12.txt b/gender_analysis/testing/test_data/document_test_files/test_text_12.txt new file mode 100644 index 00000000..b2b6e0a7 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_12.txt @@ -0,0 +1 @@ +She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring. 
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_13.txt b/gender_analysis/testing/test_data/document_test_files/test_text_13.txt new file mode 100644 index 00000000..c3c48a04 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_13.txt @@ -0,0 +1 @@ +They refuse to permit us to obtain the refuse permit. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_2.txt b/gender_analysis/testing/test_data/document_test_files/test_text_2.txt new file mode 100644 index 00000000..176e8b6b --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_2.txt @@ -0,0 +1 @@ +Hester was convicted of adultery. which made her very sad, and then Arthur was also sad, and everybody was sad and then Arthur died and it was very sad. Sadness. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_3.txt b/gender_analysis/testing/test_data/document_test_files/test_text_3.txt new file mode 100644 index 00000000..c1d9f2b1 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_3.txt @@ -0,0 +1 @@ +Hester was her convicted of adultery. which made her very sad, and then her Arthur was also sad, and her everybody was sad and then Arthur her died and it was very sad. her Sadness. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_4.txt b/gender_analysis/testing/test_data/document_test_files/test_text_4.txt new file mode 100644 index 00000000..cd7ac3af --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_4.txt @@ -0,0 +1 @@ +James was his convicted of adultery. which made him very sad, and then his Jane was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness. 
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_5.txt b/gender_analysis/testing/test_data/document_test_files/test_text_5.txt new file mode 100644 index 00000000..500c537f --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_5.txt @@ -0,0 +1 @@ +James was his convicted of adultery. which made him very sad, and then he Arthur was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_6.txt b/gender_analysis/testing/test_data/document_test_files/test_text_6.txt new file mode 100644 index 00000000..47b3ab2f --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_6.txt @@ -0,0 +1 @@ +Hester was her convicted of adultery. which made her very sad, and then she Hester was also sad, and herself everybody was sad and then she died and it was very sad. Her Sadness. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_7.txt b/gender_analysis/testing/test_data/document_test_files/test_text_7.txt new file mode 100644 index 00000000..050732b2 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_7.txt @@ -0,0 +1 @@ +James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness. diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_8.txt b/gender_analysis/testing/test_data/document_test_files/test_text_8.txt new file mode 100644 index 00000000..050732b2 --- /dev/null +++ b/gender_analysis/testing/test_data/document_test_files/test_text_8.txt @@ -0,0 +1 @@ +James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness. 
diff --git a/gender_analysis/testing/test_data/document_test_files/test_text_9.txt b/gender_analysis/testing/test_data/document_test_files/test_text_9.txt
new file mode 100644
index 00000000..33810cd0
--- /dev/null
+++ b/gender_analysis/testing/test_data/document_test_files/test_text_9.txt
@@ -0,0 +1 @@
+Jane was convicted of adultery. she was a beautiful gal, and everyone thought that she was very beautiful, and everybody was sad and then she died. Everyone agreed that she was a beautiful corpse that deserved peace.
diff --git a/gender_analysis/testing/test_data/test_pickle.pgz b/gender_analysis/testing/test_data/test_pickle.pgz
new file mode 100644
index 00000000..d56450d2
Binary files /dev/null and b/gender_analysis/testing/test_data/test_pickle.pgz differ
diff --git a/gender_analysis/testing/tests/instance_distance_test.py b/gender_analysis/testing/tests/instance_distance_test.py
new file mode 100644
index 00000000..3b15c25b
--- /dev/null
+++ b/gender_analysis/testing/tests/instance_distance_test.py
@@ -0,0 +1,128 @@
+from pathlib import Path
+import pytest
+from corpus_analysis import Corpus, Document
+from gender_analysis import common
+from gender_analysis.analysis.instance_distance import run_distance_analysis, \
+    get_highest_distances, results_by_location, results_by_author_gender, results_by_year
+from gender_analysis.testing import common as test_common
+
+# Corpus over the fixture directory, with its metadata read from DOCUMENT_TEST_CSV.
+corpus = Corpus(test_common.DOCUMENT_TEST_PATH, csv_path=test_common.DOCUMENT_TEST_CSV)
+
+# Stand-alone Document objects for the same files, so documents[i] refers to test_text_<i>.txt.
+# The tests use them as keys into the analysis results.
+# NOTE(review): author_gender is capitalised here but lower-case in the CSV; presumably this does
+# not take part in Document equality -- confirm.
+documents = []
+for i in range(14):
+    name = "test_text_" + str(i) + ".txt"
+    location = "USA" if i % 2 == 0 else "UK"
+    author_gender = "Male" if i % 3 == 0 else "Female"
+    date = i % 10 + 1950
+    documents.append(Document({"filename": name, "country_publication": location, "date": str(date),
+                               "author_gender": author_gender,
+                               "filepath": Path(test_common.DOCUMENT_TEST_PATH, name)}))
+
+
+class TestInstanceDistance:
+    """Checks ``instance_distance`` results for a few documents of the fixture corpus."""
+
+    def test_simple_run_distance_analysis_male(self):
+        results = run_distance_analysis(corpus, {common.MALE})
+
+        zero_stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
+        expected = {
+            documents[1]: {common.MALE: zero_stats},
+            documents[3]: {common.MALE: zero_stats},
+            documents[4]: {common.MALE: {"mean": 6, "median": 6, "min": 5, "max": 7}},
+        }
+
+        for document, stats in expected.items():
+            assert stats == results[document]
+
+    def test_simple_run_distance_analysis_female(self):
+        results = run_distance_analysis(corpus, {common.FEMALE})
+
+        zero_stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
+        expected = {
+            documents[1]: {common.FEMALE: zero_stats},
+            documents[3]: {common.FEMALE: {"mean": 6.2, "median": 6, "min": 5, "max": 7}},
+            documents[4]: {common.FEMALE: zero_stats},
+        }
+
+        for document, stats in expected.items():
+            assert stats == results[document]
+
+    def test_simple_run_distance_analysis_unspecified(self):
+        results = run_distance_analysis(corpus)
+
+        expected = {
+            common.FEMALE: {"mean": 17.5, "median": 17.5, "min": 6, "max": 29},
+            common.MALE: {"mean": 5, "median": 2, "min": 1, "max": 12},
+        }
+
+        assert expected == results[documents[11]]
+
+    def test_get_highest_distances(self):
+        results = get_highest_distances(run_distance_analysis(corpus), 14)
+
+        # Only three documents are checked: drop every other entry but keep the ranking order.
+        tracked = (documents[4], documents[3], documents[11])
+        filtered = {gender: [entry for entry in ranking if entry[1] in tracked]
+                    for gender, ranking in results.items()}
+
+        assert filtered == {common.MALE: [(6, documents[4]), (2, documents[11]), (0, documents[3])],
+                            common.FEMALE: [(17.5, documents[11]), (6, documents[3]),
+                                            (0, documents[4])]}
+
+    def test_results_by_location(self):
+        results = results_by_location(run_distance_analysis(corpus), "median")
+
+        expected = {
+            "USA": {documents[4]: {common.MALE: 6, common.FEMALE: 0}},
+            "UK": {documents[11]: {common.MALE: 2, common.FEMALE: 17.5},
+                   documents[3]: {common.MALE: 0, common.FEMALE: 6}},
+        }
+
+        assert results["UK"][documents[11]] == expected["UK"][documents[11]]
+        assert results["UK"][documents[3]] == expected["UK"][documents[3]]
+        assert results["USA"][documents[4]] == expected["USA"][documents[4]]
+
+    def test_results_by_author_gender(self):
+        results = results_by_author_gender(run_distance_analysis(corpus), "median")
+
+        expected = {
+            "female": {documents[4]: {common.MALE: 6, common.FEMALE: 0},
+                       documents[11]: {common.MALE: 2, common.FEMALE: 17.5}},
+            "male": {documents[3]: {common.MALE: 0, common.FEMALE: 6}},
+        }
+
+        assert results["female"][documents[11]] == expected["female"][documents[11]]
+        assert results["male"][documents[3]] == expected["male"][documents[3]]
+        assert results["female"][documents[4]] == expected["female"][documents[4]]
+
+    def test_results_by_date(self):
+        results = results_by_year(run_distance_analysis(corpus), "median", (1950, 1962), 1)
+        # test_text_13 is also dated 1953 but is not part of the expectation below.
+        results[1953].pop(documents[13])
+
+        expected = {
+            1953: {documents[3]: {common.MALE: 0, common.FEMALE: 6}},
+            1954: {documents[4]: {common.MALE: 6, common.FEMALE: 0}},
+        }
+
+        assert results[1953] == expected[1953]
+        assert results[1954] == expected[1954]
+
+    def test_results_error_throwing(self):
+        results = run_distance_analysis(corpus)
+
+        # Each grouping helper is expected to reject the metric name "mode" with a ValueError.
+        with pytest.raises(ValueError):
+            results_by_year(results, "mode", (1950, 1962), 1)
+
+        with pytest.raises(ValueError):
+            results_by_location(results, "mode")
+
+        with pytest.raises(ValueError):
+            results_by_author_gender(results, "mode")