Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9e5ba43
Added gender_pos.py
kenalba Oct 23, 2020
7b1abee
Merge branch 'master' into gender_pos
DocEon Dec 4, 2020
a9b2fc5
Adding gender_pos, refactoring gender_adjective.
DocEon Dec 4, 2020
0b36711
Linting upgrades!
DocEon Dec 4, 2020
fdee52b
More, better linting.
DocEon Dec 4, 2020
262cb8b
Changing how importing stuff works
kenalba Dec 15, 2020
042e64f
Added difference_pos_freq, pos_analysis, and display functions.
kenalba Dec 26, 2020
15d20eb
Adding documentation, handling absent text files.
kenalba Dec 27, 2020
441017c
initialized character.py
fyang3 Dec 31, 2020
719cf2e
added get_char_list function in document.py
fyang3 Dec 31, 2020
eb4119a
refined get_char_list and integrated gender identification into Chara…
fyang3 Jan 5, 2021
93fe95b
refactored for seamless integration of gender identification based on…
fyang3 Jan 8, 2021
b308a9c
created first pass for character_pos
fyang3 Jan 8, 2021
d9e76bf
tuned character.py to make it include gender obj instead of strings
fyang3 Jan 8, 2021
e046ef1
Bugbash 1
DocEon Feb 2, 2021
00c6ce2
Bug bashing!
DocEon Feb 2, 2021
1656655
fixed unable to load model error and get char list
fyang3 Feb 4, 2021
9469dcf
Merge branch 'character_class' of https://github.com/dhmit/gender_ana…
fyang3 Feb 4, 2021
e94481c
Adding similarity index stuff
kenalba Feb 9, 2021
9f7a6f2
revised char_disambiguation and create_char_objects
fyang3 Feb 9, 2021
1fbf6ec
fixed make_char_name bugs and refactor
fyang3 Feb 9, 2021
949b1bb
expanded honorifics list and robust gender detection functionality
fyang3 Feb 9, 2021
f049986
added honorifics for a more comprehensive coverage
fyang3 Apr 15, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions gender_analysis/analysis/character_pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from more_itertools import windowed
import nltk

from gender_analysis.common import ADJ_TAGS, ADV_TAGS, PROPER_NOUN_TAGS, VERB_TAGS
from gender_analysis import common
# Maps each user-facing part-of-speech key to the collection of tagger tags imported
# from gender_analysis.common that count as that part of speech.
pos_dict = {
    'adj': ADJ_TAGS,
    'adv': ADV_TAGS,
    'proper_noun': PROPER_NOUN_TAGS,
    'verb': VERB_TAGS,
}

# These functions were migrated from gender_pos and modified to work on characters;
# it may be better to add new functions to gender_pos instead.

# First pass at part-of-speech analysis for a character, based on gender_pos.
def find_char_pos(pos_to_find, char_to_find, word_window=5):
    # pylint: disable=too-many-locals
    """
    Count the words of a given part of speech that appear near mentions of a character.

    Slides a window of ``2 * word_window + 1`` tokens over the tokenized text of
    ``char_to_find.document``. Whenever the token at the centre of the window is one of the
    character's identifiers (its name or a nickname, compared case-insensitively), the
    lowercased window is tagged with ``nltk.pos_tag`` and every token whose tag belongs to
    the requested part of speech is counted.

    :param pos_to_find: A valid part of speech key from pos_dict:
        ['adj', 'adv', 'proper_noun', 'verb']
    :param char_to_find: Character; this function reads its ``name``, ``nicknames`` and
        ``document`` attributes
    :param word_window: number of words to search for in either direction of a mention
    :return: dict of lowercased words that appear around the character's identifiers mapped
        to the number of occurrences, or the string "Invalid part of speech" when
        pos_to_find is not a key of pos_dict
    """
    # Validate the request before doing any tokenization work.
    if pos_to_find not in pos_dict:
        return "Invalid part of speech"
    pos_tags = pos_dict[pos_to_find]

    # NOTE(review): assumes nicknames is None, a single string, or an iterable of
    # strings -- confirm against the Character class.
    nicknames = char_to_find.nicknames or []
    if isinstance(nicknames, str):
        nicknames = [nicknames]

    # Lowercase the identifiers because the centre token is lowercased before comparison.
    # Each identifier is compared against a single token, so an identifier made of several
    # words can never match here.
    identifiers_to_find = {
        identifier.lower()
        for identifier in [char_to_find.name, *nicknames]
        if identifier
    }

    text = char_to_find.document.get_tokenized_text()
    output = {}

    for window in windowed(text, 2 * word_window + 1):
        center = window[word_window]
        # windowed() pads with None when the text is shorter than the window size.
        if center is None or center.lower() not in identifiers_to_find:
            continue

        # Drop padding and normalise case before tagging.
        words = [word.lower() for word in window if word is not None]
        for word, tag in nltk.pos_tag(words):
            if tag in pos_tags:
                output[word] = output.get(word, 0) + 1

    return output


def find_char_adj(char_to_find, word_window=5):
    """
    Count the adjectives that appear near mentions of a character.

    Thin wrapper that calls find_char_pos with the part of speech fixed to 'adj'.

    :param char_to_find: Character
    :param word_window: number of words to search for in either direction of a mention
    :return: whatever find_char_pos returns for 'adj': a dict of adjectives mapped to the
        number of occurrences
    """
    return find_char_pos('adj', char_to_find, word_window)
Loading