Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions gender_analysis/analysis/instance_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,8 @@ def get_highest_distances(results, num):
"""
Finds the documents with the largest median distances between each gender that was analyzed.

Returns a dictionary mapping genders to a list of tuples of the form (``Document``, median),
where ``Document``\\ s with higher medians are listed first.
Returns a dictionary mapping genders to a list of tuples of the form (median, <Document>),
where `Document`s with higher medians are listed first.

:param results: dictionary of results from ``run_distance_analysis``
:param num: number of top distances to return
Expand All @@ -332,7 +332,7 @@ def get_highest_distances(results, num):

# Get all of the medians for the documents
for document in results:
for gender in results['document']:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great catch!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that makes sense in terms of not duplicating work. It might be best to hold off on merging this until that PR is merged; any changes this PR then needs can be made and merged in afterwards.

for gender in results[document]:
if gender not in medians:
medians[gender] = list()

Expand Down
24 changes: 24 additions & 0 deletions gender_analysis/testing/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pathlib import Path
from gender_analysis.common import BASE_PATH

# Root of the test data tree, located under the package's BASE_PATH.
TEST_DATA_DIR = Path(BASE_PATH).joinpath('testing', 'test_data')

# Every test that uses one of the novel corpora points here.
TEST_CORPUS_PATH = TEST_DATA_DIR / 'sample_novels' / 'texts'

# Metadata for a tiny corpus with only 4 documents.
TINY_TEST_CORPUS_CSV = TEST_DATA_DIR / 'sample_novels' / 'tiny_test_corpus.csv'

# Metadata for a small corpus with only 10 documents.
SMALL_TEST_CORPUS_CSV = TEST_DATA_DIR / 'sample_novels' / 'small_test_corpus.csv'

# Metadata for a larger corpus with 99 documents.
LARGE_TEST_CORPUS_CSV = TEST_DATA_DIR / 'sample_novels' / 'large_test_corpus.csv'

# A corpus made up of Reddit posts and comments, plus its metadata file.
REDDIT_CORPUS_PATH = TEST_DATA_DIR / 'r_starwars_data' / 'posts'
REDDIT_CORPUS_CSV = TEST_DATA_DIR / 'r_starwars_data' / 'metadata.csv'

# Directory containing a collection of test documents, plus its metadata file.
DOCUMENT_TEST_PATH = TEST_DATA_DIR / 'document_test_files'
DOCUMENT_TEST_CSV = DOCUMENT_TEST_PATH / 'document_test_files.csv'
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
author,date,title,country_publication,author_gender,filename,notes
,1950,,USA,male,test_text_0.txt,
,1951,,UK,female,test_text_1.txt,
,1952,,USA,female,test_text_2.txt,
,1953,,UK,male,test_text_3.txt,
,1954,,USA,female,test_text_4.txt,
,1955,,UK,female,test_text_5.txt,
,1956,,USA,male,test_text_6.txt,
,1957,,UK,female,test_text_7.txt,
,1958,,USA,female,test_text_8.txt,
,1959,,UK,male,test_text_9.txt,
,1950,,USA,female,test_text_10.txt,
,1951,,UK,female,test_text_11.txt,
,1952,,USA,male,test_text_12.txt,
,1953,,UK,female,test_text_13.txt,
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"This is a quote" and also "This is my quote"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
?!All-kinds %$< of pun*ct(uatio)n {a}nd sp+ecial cha/rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hester was convicted of adultery was convicted.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring. TBH i know nothing about this story.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
She took a lighter out of her purse and handed it over to him. He lit his cigarette and took a deep drag from it, and then began his speech which ended in a proposal. Her tears drowned the ring.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
They refuse to permit us to obtain the refuse permit.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hester was convicted of adultery. which made her very sad, and then Arthur was also sad, and everybody was sad and then Arthur died and it was very sad. Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hester was her convicted of adultery. which made her very sad, and then her Arthur was also sad, and her everybody was sad and then Arthur her died and it was very sad. her Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
James was his convicted of adultery. which made him very sad, and then his Jane was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
James was his convicted of adultery. which made him very sad, and then he Arthur was also sad, and himself everybody was sad and then he died and it was very sad. His Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hester was her convicted of adultery. which made her very sad, and then she Hester was also sad, and herself everybody was sad and then she died and it was very sad. Her Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
James was convicted of adultery. he was a handsome guy, and everyone thought that he was so handsome, and everybody was sad and then he died a very handsome death. His Sadness.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Jane was convicted of adultery. she was a beautiful gal, and everyone thought that she was very beautiful, and everybody was sad and then she died. Everyone agreed that she was a beautiful corpse that deserved peace.
Binary file added gender_analysis/testing/test_data/test_pickle.pgz
Binary file not shown.
149 changes: 149 additions & 0 deletions gender_analysis/testing/tests/instance_distance_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
from pathlib import Path
import pytest
from corpus_analysis import Corpus, Document
from gender_analysis import common
from gender_analysis.analysis.instance_distance import run_distance_analysis, \
get_highest_distances, results_by_location, results_by_author_gender, results_by_year
from gender_analysis.testing import common as test_common

# Corpus loaded from the document test fixtures; shared by every test in this module.
corpus = Corpus(test_common.DOCUMENT_TEST_PATH, csv_path=test_common.DOCUMENT_TEST_CSV)

# Hand-built Document objects for test_text_0.txt .. test_text_13.txt. The tests use them
# as keys when looking up and comparing analysis results.
# NOTE(review): author_gender is capitalised here ("Male"/"Female") — presumably Document
# equality does not depend on it; confirm against the Document class and the CSV fixture.
documents = []
for index in range(14):
    file_name = "test_text_{}.txt".format(index)
    year = index % 10 + 1950
    metadata = {
        "filename": file_name,
        "country_publication": "USA" if index % 2 == 0 else "UK",
        "date": str(year),
        "author_gender": "Male" if index % 3 == 0 else "Female",
        "filepath": Path(test_common.DOCUMENT_TEST_PATH, file_name),
    }
    documents.append(Document(metadata))


class TestInstanceDistance:
    """
    Tests for the instance-distance analysis functions.

    Every test runs ``run_distance_analysis`` on the module-level ``corpus`` and compares the
    entries for a few selected entries of the module-level ``documents`` list against
    hard-coded expected values.
    """

    def test_simple_run_distance_analysis_male(self):
        """Restricting the analysis to ``common.MALE`` yields the expected stats for docs 1, 3, 4."""
        results = run_distance_analysis(corpus, {common.MALE})

        expected = {
            documents[1]: {common.MALE: {"mean": 0, "median": 0, "min": 0, "max": 0}},
            documents[4]: {common.MALE: {"mean": 6, "median": 6, "min": 5, "max": 7}},
            documents[3]: {common.MALE: {"mean": 0, "median": 0, "min": 0, "max": 0}},
        }

        assert expected[documents[1]] == results[documents[1]]
        assert expected[documents[3]] == results[documents[3]]
        assert expected[documents[4]] == results[documents[4]]

    def test_simple_run_distance_analysis_female(self):
        """Restricting the analysis to ``common.FEMALE`` yields the expected stats for docs 1, 3, 4."""
        results = run_distance_analysis(corpus, {common.FEMALE})

        expected = {
            documents[1]: {common.FEMALE: {"mean": 0, "median": 0, "min": 0, "max": 0}},
            documents[4]: {common.FEMALE: {"mean": 0, "median": 0, "min": 0, "max": 0}},
            documents[3]: {common.FEMALE: {"mean": 6.2, "median": 6, "min": 5, "max": 7}},
        }

        assert expected[documents[1]] == results[documents[1]]
        assert expected[documents[3]] == results[documents[3]]
        assert expected[documents[4]] == results[documents[4]]

    def test_simple_run_distance_analysis_unspecified(self):
        """Without an explicit gender set, document 11 gets both female and male stats."""
        results = run_distance_analysis(corpus)

        expected = {
            documents[11]: {
                common.FEMALE: {"mean": 17.5, "median": 17.5, "min": 6, "max": 29},
                common.MALE: {"mean": 5, "median": 2, "min": 1, "max": 12},
            },
        }

        assert expected[documents[11]] == results[documents[11]]

    def test_get_highest_distances(self):
        """``get_highest_distances`` lists (median, document) tuples, highest median first."""
        results = run_distance_analysis(corpus)
        results = get_highest_distances(results, 14)

        # Only three documents are checked; drop every other entry while keeping the
        # relative order produced by get_highest_distances.
        tracked = (documents[4], documents[3], documents[11])
        for gender in (common.MALE, common.FEMALE):
            results[gender] = [entry for entry in results[gender] if entry[1] in tracked]

        assert results == {common.MALE: [(6, documents[4]), (2, documents[11]), (0, documents[3])],
                           common.FEMALE: [(17.5, documents[11]), (6, documents[3]),
                                           (0, documents[4])]}

    def test_results_by_location(self):
        """``results_by_location`` groups the per-document medians under "USA" and "UK"."""
        results = run_distance_analysis(corpus)

        results = results_by_location(results, "median")

        expected = {
            "USA": {documents[4]: {common.MALE: 6, common.FEMALE: 0}},
            "UK": {
                documents[11]: {common.MALE: 2, common.FEMALE: 17.5},
                documents[3]: {common.MALE: 0, common.FEMALE: 6},
            },
        }

        assert results["UK"][documents[11]] == expected["UK"][documents[11]]
        assert results["UK"][documents[3]] == expected["UK"][documents[3]]
        assert results["USA"][documents[4]] == expected["USA"][documents[4]]

    def test_results_by_author_gender(self):
        """``results_by_author_gender`` groups the per-document medians under "male"/"female"."""
        results = run_distance_analysis(corpus)
        results = results_by_author_gender(results, "median")

        expected = {
            "female": {
                documents[4]: {common.MALE: 6, common.FEMALE: 0},
                documents[11]: {common.MALE: 2, common.FEMALE: 17.5},
            },
            "male": {documents[3]: {common.MALE: 0, common.FEMALE: 6}},
        }

        assert results["female"][documents[11]] == expected["female"][documents[11]]
        assert results["male"][documents[3]] == expected["male"][documents[3]]
        assert results["female"][documents[4]] == expected["female"][documents[4]]

    def test_results_by_date(self):
        """``results_by_year`` with a (1950, 1962) range and step 1 buckets documents by year."""
        results = run_distance_analysis(corpus)

        results = results_by_year(results, "median", (1950, 1962), 1)
        # documents[13] is also dated 1953; remove it so that only documents[3] is compared.
        results[1953].pop(documents[13])

        expected = {
            1953: {documents[3]: {common.MALE: 0, common.FEMALE: 6}},
            1954: {documents[4]: {common.MALE: 6, common.FEMALE: 0}},
        }

        assert results[1953] == expected[1953]
        assert results[1954] == expected[1954]

    def test_results_error_throwing(self):
        """Each ``results_by_*`` function raises ``ValueError`` for the stat name "mode"."""
        results = run_distance_analysis(corpus)
        with pytest.raises(ValueError):
            results_by_year(results, "mode", (1950, 1962), 1)

        with pytest.raises(ValueError):
            results_by_location(results, "mode")

        with pytest.raises(ValueError):
            results_by_author_gender(results, "mode")