Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions corpus_analysis/character.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
from gender_analysis.pronouns import PronounSeries
from gender_analysis.gender import Gender

# Honorific titles used by the character-identification pipeline to strip or
# classify name prefixes.  Grouped by the gender the title conventionally
# signals; HONORIFICS is the union used for generic filtering.
FEMALE_HONORIFICS = ["Miss", "Mrs", "Ms", "Mistress", "Madam", "Ma'am", "Dame",
                     "Lady", "Her Honour", "Her Honor", "My Lady", "Your Ladyship",
                     "Sr", "Sister", "Sayyidah"]
# NOTE: the original list contained "Master" twice; the duplicate was removed.
MALE_HONORIFICS = ["Master", "Mr", "Sir", "Gentleman", "Sire", "Lord", "His Honour",
                   "His Honor", "My Lord", "Your Lordship", "Esquire", "Esq",
                   "His Holiness", "Pope", "His All Holiness", "His Beatitude", "The Reverend",
                   "Rev", "Fr", "Father", "Pr", "Pastor", "Br", "Brother", "Rabbi", "Imam",
                   "Mufti", "Sayyid", "Captain"]
NEUTRAL_HONORIFICS = ["Mx", "Excellency", "Excellence", "Your Honor", "The Honorable",
                      "The Honourable", "The Hon", "Hon", "The Hon'ble", "The Right Honourable",
                      "The Most Honourable", "Dr", "Doctor", "Professor", "QC", "Cl", "S Cl",
                      "Counsel", "Senior Counsel", "Eur Ing", "Vice-Chancellor", "Principal",
                      "President", "Warden", "Dean", "Regent", "Rector", "Provost", "Director",
                      "Chief Executive", "Venerable", "Eminent"]
HONORIFICS = FEMALE_HONORIFICS + MALE_HONORIFICS + NEUTRAL_HONORIFICS

class Character:
"""
Expand Down
227 changes: 227 additions & 0 deletions corpus_analysis/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@

from corpus_analysis import common

# for character identification pipeline
from corpus_analysis.character import HONORIFICS

import spacy
nlp = spacy.load('en_core_web_sm')

# Add neural coref to SpaCy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

class Document:
"""
Expand Down Expand Up @@ -637,3 +646,221 @@ def update_metadata(self, new_metadata):
f" not '{new_metadata['date']}'"
) from err
setattr(self, key, new_metadata[key])

def get_char_list(self, cutoff_num=10):
    """
    Find named characters in the document together with their mention counts.

    Sentences are tokenized and POS-tagged with NLTK, then chunked with
    ``nltk.ne_chunk``; only chunks labelled ``PERSON`` are kept.  Bare
    honorifics (e.g. "Mrs", "Sir") are discarded because they are titles,
    not names.

    :param cutoff_num: threshold (defaults to 10); names mentioned fewer
        than ``cutoff_num`` times are dropped
    :return: list of ``(name, count)`` tuples in descending count order,
        restricted to names occurring at least ``cutoff_num`` times
    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> persuasion_chars
    [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)]
    >>> len(persuasion_chars)
    23
    """
    # Count only PERSON entities as we chunk; the other NE labels
    # (GPE, ORGANIZATION, ...) were collected by the original code but
    # never used.
    person_counts = {}
    for sent in nltk.sent_tokenize(self.text):
        sentence_tokens = nltk.word_tokenize(sent)
        sentence_pos_tags = nltk.pos_tag(sentence_tokens)
        for chunk in nltk.ne_chunk(sentence_pos_tags):
            # Named entities come back as subtrees; plain tokens are tuples.
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'PERSON':
                name = ' '.join(token for token, _tag in chunk)
                # A bare honorific ("Mrs", "Captain") is not a character name.
                if name not in HONORIFICS:
                    person_counts[name] = person_counts.get(name, 0) + 1

    ranked = sorted(person_counts.items(), key=lambda pair: pair[1], reverse=True)
    # ranked is in descending count order, so we can stop at the first name
    # that falls below the threshold.
    char_list = []
    for name, count in ranked:
        if count < cutoff_num:
            break
        char_list.append((name, count))
    return char_list

@staticmethod
def filter_honr(name):
    """Strip honorific titles (e.g. "Mrs", "Captain") from a full name.

    :param name: full character name as a single space-separated string
    :return: list of the name's words with any honorifics removed
    """
    words = name.split(' ')
    return [word for word in words if word not in HONORIFICS]

def coref_resolution(self, cutoff_num=20):
    """Use neuralcoref to get coreference clusters, keeping only clusters
    that overlap with the names found by ``get_char_list`` — a prep step
    for disambiguation.

    :param cutoff_num: mention threshold passed through to ``get_char_list``
    :return: dict mapping each kept cluster's main mention to its list of
        mentions (spaCy ``Span`` objects), including the main mention itself
    """
    char_names = self.get_char_list(cutoff_num)
    just_names = {name for name, _count in char_names}
    doc = nlp(self.text)
    condensed_clusters = {}

    for cluster in doc._.coref_clusters:
        # Copy so we do not mutate the spaCy cluster's own mention list
        # (the original aliased cluster.mentions and appended to it).
        mentions_and_pronouns = list(cluster.mentions)
        mentions_and_pronouns.append(cluster.main)
        # neuralcoref mentions are Span objects, so compare by their text;
        # the original compared Spans against plain strings, which never
        # matched and left every cluster filtered out.
        mention_texts = {str(mention) for mention in mentions_and_pronouns}
        if mention_texts & just_names:  # keep only clusters tied to a known character
            condensed_clusters[cluster.main] = mentions_and_pronouns
    return condensed_clusters


def char_name_disambiguation(self, char_list):
    """Given a list of character names in a document, group them by
    potential nicknames.

    :param char_list: list of ``(name, freq)`` tuples from ``get_char_list``
    :return: list of clusters; each cluster is a list of ``(name, freq)``
        tuples whose first entry is the anchor name, followed by every later
        name sharing a non-honorific word with it
    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> disamb = persuasion.char_name_disambiguation(persuasion_chars)
    >>> disamb
    [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]]
    >>> len(disamb)
    22
    """
    # NOTE(review): clusters may overlap (a name can appear in several) and
    # the final list entry never anchors its own cluster — both quirks are
    # pinned by the doctest above, so they are preserved here.
    clusters = []
    for idx, anchor in enumerate(char_list[:-1]):
        anchor_words = set(self.filter_honr(anchor[0]))
        cluster = [anchor]
        for candidate in char_list[idx + 1:]:
            if anchor_words & set(self.filter_honr(candidate[0])):
                cluster.append(candidate)
        clusters.append(cluster)
    return clusters

def manual_disamb_pipeline(self, cutoff_num=10):
    """Console pipeline for manually confirming character-name clusters.

    Walks the automatically built clusters from ``char_name_disambiguation``
    and asks the user, per candidate nickname, whether it really refers to
    the cluster's main name.

    :param cutoff_num: mention threshold passed through to ``get_char_list``
    :return: dict mapping each main character name to the list of
        user-confirmed nicknames
    """
    char_list = self.get_char_list(cutoff_num)
    name_clusters = self.char_name_disambiguation(char_list)

    print("Hello! Welcome to the Disambiguation Pipeline of {}".format(self.title))
    print("{num} of characters names in total".format(num=len(name_clusters)))

    resolved = {}

    # char_name_disambiguation returns a list of clusters; the first entry
    # of each cluster is the main (name, freq) pair and the rest are
    # candidate nicknames.  (The original code indexed this list as if it
    # were a dict, which raised TypeError — lists are unhashable keys.)
    for cluster in name_clusters:  # potential problem: too many name-mention pairs
        name = cluster[0][0]
        candidates = [mention for mention, _freq in cluster[1:]]  # assume no pronouns
        resolved[name] = []
        for mention in candidates:
            print('Potential Nickname: ', mention)
            # Re-prompt until we get a valid answer; the original asked once
            # more on bad input but threw the second answer away.
            while True:
                choice = input('Is {nickname} a nickname for '
                               '{name}? y/n'.format(nickname=mention, name=name))
                if choice in ('y', 'n'):
                    break
                print('Invalid choice, please enter y/n')
            if choice == 'y':
                resolved[name].append(mention)
        print('{name} has the following nicknames: {mentions}'.format(
            name=name, mentions=resolved[name]))
        new_name_choice = input('Is there a nickname that we neglected? y/n')
        if new_name_choice == 'y':
            new_name = input('Please enter such a name: ')
            # The original read this answer but never stored it.
            resolved[name].append(new_name)
        print('Finished disambiguating {name}'.format(name=name))

    # TODO: output `resolved` to a json/txt/csv file
    return resolved





















# NOTE(review): this is a byte-for-byte duplicate of the
# char_name_disambiguation defined earlier in this class.  Because it
# appears later, this definition is the one Python actually binds — one of
# the two copies should be deleted.
def char_name_disambiguation(self, char_list):
    """given a list of char names in a document, group them by potential nicknames
    :param char_list: a list of character as well as their freq from get_char_list
    :return: a list of list of character names and freq where the first one is the name,
    followed by nicknames
    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> disamb = persuasion.char_name_disambiguation(persuasion_chars)
    >>> disamb
    [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]]
    >>> len(disamb)
    22
    """
    # Pair each name with every LATER name sharing a non-honorific word;
    # clusters can overlap, and the last list entry never anchors a cluster
    # (range stops at len - 1) — see the doctest's 22-of-23 output.
    to_return = []
    for i in range(len(char_list) - 1):
        char_cluster = [char_list[i]]
        for j in range(i + 1, len(char_list)):
            if set(self.filter_honr(char_list[i][0])).intersection(
                    set(self.filter_honr(char_list[j][0]))):
                char_cluster.append(char_list[j])
        to_return.append(char_cluster)
    return to_return
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we talked about in Slack briefly, this method probably requires some thinking through. If I'm reading the test output in 710 correctly, it looks like the disambiguation is overly generous, and I suspect we can figure out a more optimized way to traverse those character lists. Let's chat through some issues in office hours.



'''def char_disamb_rev(self,char_list):
output = {}
for name in char_list:
filtered_name = remove_honorifics(name)
if filtered_name not in output:
output[filtered_name] = Counter({name: 1})
else:
output[filtered_name] + {name: 1}'''
2 changes: 0 additions & 2 deletions gender_analysis/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
'dunning',
'gender_frequency',
'instance_distance',
'proximity',
]

from gender_analysis.analysis.dependency_parsing import *
from gender_analysis.analysis.dunning import *
from gender_analysis.analysis.gender_frequency import *
from gender_analysis.analysis.instance_distance import *
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A note for posterity. This is a temporary measure to prevent circular imports caused by proximity.py importing Corpus for type hinting. PR #163 attempts to address the issue more fundamentally.

from gender_analysis.analysis.proximity import *
10 changes: 5 additions & 5 deletions gender_analysis/analysis/proximity.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _diff_gender_token_counters(gender_token_counters: GenderTokenCounters,
return difference_dict


def _generate_token_counter(document: Document,
def _generate_token_counter(document,
gender_to_find: Gender,
word_window: int,
tags: Sequence[str],
Expand Down Expand Up @@ -139,7 +139,7 @@ def _generate_token_counter(document: Document,
return output


def _generate_gender_token_counters(document: Document,
def _generate_gender_token_counters(document,
genders: Sequence[Gender],
tags: Sequence[str],
word_window: int) -> GenderTokenCounters:
Expand Down Expand Up @@ -278,7 +278,7 @@ def _sort_token_counter(token_counter: Counter,
return output_token_counter.most_common(limit)


def find_in_document_gender(document: Document,
def find_in_document_gender(document,
gender: Gender,
tags: Sequence[str] = None,
word_window: int = 5,
Expand Down Expand Up @@ -313,7 +313,7 @@ def find_in_document_gender(document: Document,
genders_to_exclude=genders_to_exclude)


def find_in_document_female(document: Document,
def find_in_document_female(document,
tags: Sequence[str] = None,
word_window: int = 5) -> Counter:
"""
Expand Down Expand Up @@ -391,7 +391,7 @@ class GenderProximityAnalyzer:
"""

def __init__(self,
texts: Union[Document, Corpus, Sequence[Document]],
texts,
tags: Optional[Sequence[str]] = None,
genders: Optional[Sequence[Gender]] = None,
word_window: int = 5) -> None:
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ scipy
wordcloud
coverage==5.3
pytest==6.0.2
-e git+https://github.com/huggingface/neuralcoref.git@0cff3c94e6019f6bee1004b58a3f0cd59c806fcf#egg=neuralcoref