diff --git a/corpus_analysis/character.py b/corpus_analysis/character.py index 6fc4c8d0..fc60fe59 100644 --- a/corpus_analysis/character.py +++ b/corpus_analysis/character.py @@ -1,6 +1,21 @@ from gender_analysis.pronouns import PronounSeries from gender_analysis.gender import Gender +FEMALE_HONORIFICS = ["Miss", "Mrs", "Ms", "Mistress", "Madam", "Ma'am", "Dame", + "Lady", "Her Honour", "Her Honor", "My Lady", "Your Ladyship", + "Sr", "Sister", "Sayyidah"] +MALE_HONORIFICS = ["Master", "Mr", "Sir", "Gentleman", "Sire", "Lord", "His Honour", + "His Honor", "My Lord", "Your Lordship", "Master", "Esquire", "Esq", + "His Holiness", "Pope", "His All Holiness", "His Beatitude", "The Reverend", + "Rev", "Fr", "Father", "Pr", "Pastor", "Br", "Brother", "Rabbi", "Imam", + "Mufti", "Sayyid", "Captain"] +NEUTRAL_HONORIFICS = ["Mx", "Excellency", "Excellence", "Your Honor", "The Honorable", + "The Honourable", "The Hon", "Hon", "The Hon'ble", "The Right Honourable", + "The Most Honourable", "Dr", "Doctor", "Professor", "QC", "Cl", "S Cl", + "Counsel", "Senior Counsel", "Eur Ing", "Vice-Chancellor", "Principal", + "President", "Warden", "Dean", "Regent", "Rector", "Provost", "Director", + "Chief Executive", "Venerable", "Eminent"] +HONORIFICS = FEMALE_HONORIFICS + MALE_HONORIFICS + NEUTRAL_HONORIFICS class Character: """ diff --git a/corpus_analysis/document.py b/corpus_analysis/document.py index ca5cf7dc..66e9bf22 100644 --- a/corpus_analysis/document.py +++ b/corpus_analysis/document.py @@ -9,6 +9,15 @@ from corpus_analysis import common +# for character identification pipeline +from corpus_analysis.character import HONORIFICS + +import spacy +nlp = spacy.load('en_core_web_sm') + +# Add neural coref to SpaCy's pipe +import neuralcoref +neuralcoref.add_to_pipe(nlp) class Document: """ @@ -637,3 +646,221 @@ def update_metadata(self, new_metadata): f" not '{new_metadata['date']}'" ) from err setattr(self, key, new_metadata[key]) + + def get_char_list(self, cutoff_num=10): + """ 
+ given a document object, find a list of characters with their frequency in the novels + :param cutoff_num: cutoff_num defaults to 10 for the threshold for cutoffs based on named frequency + :return: a list of tuples with character names in descending sorted order that occur + more than the cutoff_num times in the document + >>> from corpus_analysis import document + >>> from pathlib import Path + >>> from gender_analysis import common + >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', + ... 'filename': 'austen_persuasion.txt', + ... 'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', + ... 'texts', 'austen_persuasion.txt')} + >>> persuasion = document.Document(document_metadata) + >>> persuasion_chars = persuasion.get_char_list(20) + >>> persuasion_chars + [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)] + >>> len(persuasion_chars) + 23 + """ + + labels_char = [] + labels = 'FACILITY,GPE,GSP,LOCATION,ORGANIZATION,PERSON' + document = self.text + sentences = nltk.sent_tokenize(document) + for sent in sentences: + sentence_tokens = nltk.word_tokenize(sent) + sentence_pos_tags = nltk.pos_tag(sentence_tokens) + sentence_chunks = nltk.ne_chunk(sentence_pos_tags) + for chunk in sentence_chunks: + #if hasattr(chunk, 'label'): + if isinstance(chunk, nltk.tree.Tree): + labels_char.append((chunk.label(), ' '.join(c[0] for c in chunk))) + char_dict = {lab: {} for lab in labels.split(',')} + for character in labels_char: + label, name = character + char_dict[label][name] = char_dict[label].get(name, 0) + 1 + #for ch in 
labels_char: + #cat = char_dict[ch[0]] + #cat[ch[1]] = cat.get(ch[1], 0) + 1 + people = char_dict['PERSON'] + people_sorted = [(p, people[p]) for p in people if p not in HONORIFICS] + people_sorted = sorted(people_sorted, key=lambda p: p[1], reverse=True) + cutoff = len(people_sorted) + for i in range(len(people_sorted)): + if people_sorted[i][1] < cutoff_num: + cutoff = i + break + char_list = people_sorted[:cutoff] + + return char_list + + @staticmethod + def filter_honr(name): + name = name.split(' ') + return [n for n in name if n not in HONORIFICS] + + def coref_resolution(self, cutoff_num=20): + """Use Neuralcoref for getting coref clusters and remove 'extra' ones with no overlap with + char_list - a prep function for disamb + output: a cluster of dict type, mapping char_name to its mentions and (optional) pronouns""" + char_names = self.get_char_list(cutoff_num) + just_names = [name[0] for name in char_names] + doc = nlp(self.text) + coref_clusters = doc._.coref_clusters + condensed_clusters = {} + + # currently, this function doesn't quite work for it will only return empty clusters + for clu in coref_clusters: + mentions_and_pronouns = clu.mentions + mentions_and_pronouns.append(clu.main) + if set(mentions_and_pronouns).intersection(set(just_names)): # get rid of extra clusters + condensed_clusters[clu.main] = mentions_and_pronouns + return condensed_clusters + + + def char_name_disambiguation(self, char_list): + """given a list of char names in a document, group them by potential nicknames + :param char_list: a list of character as well as their freq from get_char_list + :return: a list of list of character names and freq where the first one is the name, + followed by nicknames + >>> from corpus_analysis import document + >>> from pathlib import Path + >>> from gender_analysis import common + >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', + ... 'filename': 'austen_persuasion.txt', + ... 
'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', + ... 'texts', 'austen_persuasion.txt')} + >>> persuasion = document.Document(document_metadata) + >>> persuasion_chars = persuasion.get_char_list(20) + >>> disamb = persuasion.char_name_disambiguation(persuasion_chars) + >>> disamb + [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]] + >>> len(disamb) + 22 + """ + + # this function needs major reworks to take into the following into account: + # honorifics; substring comparison; some AI approach?; manual nickname database + + to_return = [] + for i in range(len(char_list) - 1): + char_cluster = [char_list[i]] + for j in range(i + 1, len(char_list)): + if set(self.filter_honr(char_list[i][0])).intersection( + set(self.filter_honr(char_list[j][0]))): + char_cluster.append(char_list[j]) + to_return.append(char_cluster) + return to_return + + def manual_disamb_pipeline(self,cutoff_num=10): + """creates a simple pipeline for human-computer interaction in a console-based environment + takes in the disambiguated results from char_name_disambiguation, enable users to manually + disamb, and output a json/txt/csv file + input format {ideal}: { + char_list[0] : [(possible_match_1, probability_of_match_1), ...] + char_list[1] : [(possible_match_1, probability_of_match_1), ...] + ... 
+ } + ### Assume We don't have pronouns in the name clusters, only nicknames ### + """ + + # this is the ideal version + char_list = self.get_char_list(cutoff_num) + name_clusters = self.char_name_disambiguation(char_list) + + # this is what I could do right now without the collective ideal version + # name_clusters = self.coref_resolution(cutoff_num) + + print("Hello! Welcome to the Disambiguation Pipeline of {}".format(self.title)) + print("{num} of characters names in total".format(num=len(name_clusters))) + + resolved = {} + + for name in name_clusters: # potential problem: too many name-mention pairs + resolved[name] = [] + mentions = name_clusters[name] # assume no pronouns + for mention in mentions: + print('Potential Nickname: ', mention) + choice = input('Is {nickname} a nickname for ' + '{name}? y/n'.format(nickname=mention, name=name)) + if choice == 'y': + resolved[name].append(mention) + elif choice == 'n': + pass + else: + print('Invalid choice, please enter y/n') + choice = input('Is {nickname} a nickname for ' + '{name}? y/n'.format(nickname=mention, name=name)) + print('{name} has the following nicknames: {mentions}'.format(name=name,mentions=mentions)) + new_name_choice = input('Is there a nickname that we neglected? y/n') + if new_name_choice == 'y': + new_name = input('Please enter such a name: ') + print('Finished diambiguating {name}'.format(name=name)) + + + # output to a file + + + + + + + + + + + + + + + + + + + + + + def char_name_disambiguation(self, char_list): + """given a list of char names in a document, group them by potential nicknames + :param char_list: a list of character as well as their freq from get_char_list + :return: a list of list of character names and freq where the first one is the name, + followed by nicknames + >>> from corpus_analysis import document + >>> from pathlib import Path + >>> from gender_analysis import common + >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', + ... 
'filename': 'austen_persuasion.txt', + ... 'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', + ... 'texts', 'austen_persuasion.txt')} + >>> persuasion = document.Document(document_metadata) + >>> persuasion_chars = persuasion.get_char_list(20) + >>> disamb = persuasion.char_name_disambiguation(persuasion_chars) + >>> disamb + [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]] + >>> len(disamb) + 22 + """ + to_return = [] + for i in range(len(char_list) - 1): + char_cluster = [char_list[i]] + for j in range(i + 1, len(char_list)): + if set(self.filter_honr(char_list[i][0])).intersection( + set(self.filter_honr(char_list[j][0]))): + char_cluster.append(char_list[j]) + to_return.append(char_cluster) + return to_return + + + '''def char_disamb_rev(self,char_list): + output = {} + for name in char_list: + filtered_name = remove_honorifics(name) + if filtered_name not in output: + output[filtered_name] = Counter({name: 1}) + else: + output[filtered_name] + {name: 1}''' diff --git a/gender_analysis/analysis/__init__.py b/gender_analysis/analysis/__init__.py index c9ffb688..a100175c 100644 --- a/gender_analysis/analysis/__init__.py +++ b/gender_analysis/analysis/__init__.py @@ -3,11 +3,9 @@ 'dunning', 'gender_frequency', 'instance_distance', - 'proximity', ] from gender_analysis.analysis.dependency_parsing import * from gender_analysis.analysis.dunning import * from gender_analysis.analysis.gender_frequency import * from 
gender_analysis.analysis.instance_distance import * -from gender_analysis.analysis.proximity import * diff --git a/gender_analysis/analysis/proximity.py b/gender_analysis/analysis/proximity.py index 034bd1f4..b58d14bd 100644 --- a/gender_analysis/analysis/proximity.py +++ b/gender_analysis/analysis/proximity.py @@ -79,7 +79,7 @@ def _diff_gender_token_counters(gender_token_counters: GenderTokenCounters, return difference_dict -def _generate_token_counter(document: Document, +def _generate_token_counter(document, gender_to_find: Gender, word_window: int, tags: Sequence[str], @@ -139,7 +139,7 @@ def _generate_token_counter(document: Document, return output -def _generate_gender_token_counters(document: Document, +def _generate_gender_token_counters(document, genders: Sequence[Gender], tags: Sequence[str], word_window: int) -> GenderTokenCounters: @@ -278,7 +278,7 @@ def _sort_token_counter(token_counter: Counter, return output_token_counter.most_common(limit) -def find_in_document_gender(document: Document, +def find_in_document_gender(document, gender: Gender, tags: Sequence[str] = None, word_window: int = 5, @@ -313,7 +313,7 @@ def find_in_document_gender(document: Document, genders_to_exclude=genders_to_exclude) -def find_in_document_female(document: Document, +def find_in_document_female(document, tags: Sequence[str] = None, word_window: int = 5) -> Counter: """ @@ -391,7 +391,7 @@ class GenderProximityAnalyzer: """ def __init__(self, - texts: Union[Document, Corpus, Sequence[Document]], + texts, tags: Optional[Sequence[str]] = None, genders: Optional[Sequence[Gender]] = None, word_window: int = 5) -> None: diff --git a/requirements.txt b/requirements.txt index 140aef65..294d9340 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,5 @@ scipy wordcloud coverage==5.3 pytest==6.0.2 +-e git+https://github.com/huggingface/neuralcoref.git@0cff3c94e6019f6bee1004b58a3f0cd59c806fcf#egg=neuralcoref +