-
Notifications
You must be signed in to change notification settings - Fork 5
char_list and char_disamb functions in document.py #164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,15 @@ | |
|
|
||
| from corpus_analysis import common | ||
|
|
||
| # for character identification pipeline | ||
| from corpus_analysis.character import HONORIFICS | ||
|
|
||
| import spacy | ||
| nlp = spacy.load('en_core_web_sm') | ||
|
|
||
| # Add neural coref to SpaCy's pipe | ||
| import neuralcoref | ||
| neuralcoref.add_to_pipe(nlp) | ||
|
|
||
| class Document: | ||
| """ | ||
|
|
@@ -637,3 +646,221 @@ def update_metadata(self, new_metadata): | |
| f" not '{new_metadata['date']}'" | ||
| ) from err | ||
| setattr(self, key, new_metadata[key]) | ||
|
|
||
| def get_char_list(self, cutoff_num=10): | ||
| """ | ||
| given a document object, find a list of characters with their frequency in the novels | ||
| :param cutoff_num: cutoff_num defaults to 10 for the thredshold for cutoffs based on named frequency | ||
| :return: a list of tuples with character names in descending sorted order that occurs | ||
| more than the cutoff_num times in the document | ||
| >>> from corpus_analysis import document | ||
| >>> from pathlib import Path | ||
| >>> from gender_analysis import common | ||
| >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', | ||
| ... 'filename': 'austen_persuasion.txt', | ||
| ... 'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', | ||
| ... 'texts', 'austen_persuasion.txt')} | ||
| >>> persuasion = document.Document(document_metadata) | ||
| >>> persuasion_chars = persuasion.get_char_list(20) | ||
| >>> persuasion_chars | ||
| [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)] | ||
| >>> len(persuasion_chars) | ||
| 23 | ||
| """ | ||
|
|
||
| labels_char = [] | ||
| labels = 'FACILITY,GPE,GSP,LOCATION,ORGANIZATION,PERSON' | ||
| document = self.text | ||
| sentences = nltk.sent_tokenize(document) | ||
| for sent in sentences: | ||
| sentence_tokens = nltk.word_tokenize(sent) | ||
| sentence_pos_tags = nltk.pos_tag(sentence_tokens) | ||
| sentence_chunks = nltk.ne_chunk(sentence_pos_tags) | ||
| for chunk in sentence_chunks: | ||
| #if hasattr(chunk, 'label'): | ||
| if isinstance(chunk, nltk.tree.Tree): | ||
| labels_char.append((chunk.label(), ' '.join(c[0] for c in chunk))) | ||
| char_dict = {lab: {} for lab in labels.split(',')} | ||
| for character in labels_char: | ||
| label, name = character | ||
| char_dict[label][name] = char_dict[label].get(name, 0) + 1 | ||
| #for ch in labels_char: | ||
| #cat = char_dict[ch[0]] | ||
| #cat[ch[1]] = cat.get(ch[1], 0) + 1 | ||
| people = char_dict['PERSON'] | ||
| people_sorted = [(p, people[p]) for p in people if p not in HONORIFICS] | ||
| people_sorted = sorted(people_sorted, key=lambda p: p[1], reverse=True) | ||
| cutoff = len(people_sorted) | ||
| for i in range(len(people_sorted)): | ||
| if people_sorted[i][1] < cutoff_num: | ||
| cutoff = i | ||
| break | ||
| char_list = people_sorted[:cutoff] | ||
|
|
||
| return char_list | ||
|
|
||
| @staticmethod | ||
| def filter_honr(name): | ||
| name = name.split(' ') | ||
| return [n for n in name if n not in HONORIFICS] | ||
|
|
||
| def coref_resolution(self, cutoff_num=20): | ||
| """Use Neuralcoref for getting coref clusters and remove 'extra' ones with no overlap with | ||
| char_list - a prep function for disamb | ||
| output: a cluster of dict type, mapping char_name to its mentions and (optional) pronouns""" | ||
| char_names = self.get_char_list(cutoff_num) | ||
| just_names = [name[0] for name in char_names] | ||
| doc = nlp(self.text) | ||
| coref_clusters = doc._.coref_clusters | ||
| condensed_clusters = {} | ||
|
|
||
| # currently, this function doesn't quite work for it will only return empty clusters | ||
| for clu in coref_clusters: | ||
| mentions_and_pronouns = clu.mentions | ||
| mentions_and_pronouns.append(clu.main) | ||
| if set(mentions_and_pronouns).intersection(set(just_names)): # get rid of extra clusters | ||
| condensed_clusters[clu.main] = mentions_and_pronouns | ||
| return condensed_clusters | ||
|
|
||
|
|
||
| def char_name_disambiguation(self, char_list): | ||
| """given a list of char names in a document, group them by potential nicknames | ||
| :param char_list: a list of character as well as their freq from get_char_list | ||
| :return: a list of list of character names and freq where the first one is the name, | ||
| followed by nicknames | ||
| >>> from corpus_analysis import document | ||
| >>> from pathlib import Path | ||
| >>> from gender_analysis import common | ||
| >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', | ||
| ... 'filename': 'austen_persuasion.txt', | ||
| ... 'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', | ||
| ... 'texts', 'austen_persuasion.txt')} | ||
| >>> persuasion = document.Document(document_metadata) | ||
| >>> persuasion_chars = persuasion.get_char_list(20) | ||
| >>> disamb = persuasion.char_name_disambiguation(persuasion_chars) | ||
| >>> disamb | ||
| [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]] | ||
| >>> len(disamb) | ||
| 22 | ||
| """ | ||
|
|
||
| # this function needs major reworks to take into the following into account: | ||
| # honorifics; substring comparison; some AI approach?; manual nickname database | ||
|
|
||
| to_return = [] | ||
| for i in range(len(char_list) - 1): | ||
| char_cluster = [char_list[i]] | ||
| for j in range(i + 1, len(char_list)): | ||
| if set(self.filter_honr(char_list[i][0])).intersection( | ||
| set(self.filter_honr(char_list[j][0]))): | ||
| char_cluster.append(char_list[j]) | ||
| to_return.append(char_cluster) | ||
| return to_return | ||
|
|
||
| def manual_disamb_pipeline(self,cutoff_num=10): | ||
| """creates a simple pipeline for human-computer interaction in a console-based environment | ||
| takes in the disambiguated results from char_name_disambiguation, enable users to manually | ||
| disamb, and output a json/txt/csv file | ||
| input format {ideal}: { | ||
| char_list[0] : [(possible_match_1, probability_of_match_1), ...] | ||
| char_list[1] : [(possible_match_1, probability_of_match_1), ...] | ||
| ... | ||
| } | ||
| ### Assume We don't have pronouns in the name clusters, only nicknames ### | ||
| """ | ||
|
|
||
| # this is the ideal version | ||
| char_list = self.get_char_list(cutoff_num) | ||
| name_clusters = self.char_name_disambiguation(char_list) | ||
|
|
||
| # this is what I could do right now without the collective ideal version | ||
| # name_clusters = self.coref_resolution(cutoff_num) | ||
|
|
||
| print("Hello! Welcome to the Disambiguation Pipeline of {}".format(self.title)) | ||
| print("{num} of characters names in total".format(num=len(name_clusters))) | ||
|
|
||
| resolved = {} | ||
|
|
||
| for name in name_clusters: # potential problem: too many name-mention pairs | ||
| resolved[name] = [] | ||
| mentions = name_clusters[name] # assume no pronouns | ||
| for mention in mentions: | ||
| print('Potential Nickname: ', mention) | ||
| choice = input('Is {nickname} a nickname for ' | ||
| '{name}? y/n'.format(nickname=mention, name=name)) | ||
| if choice == 'y': | ||
| resolved[name].append(mention) | ||
| elif choice == 'n': | ||
| pass | ||
| else: | ||
| print('Invalid choice, please enter y/n') | ||
| choice = input('Is {nickname} a nickname for ' | ||
| '{name}? y/n'.format(nickname=mention, name=name)) | ||
| print('{name} has the following nicknames: {mentions}'.format(name=name,mentions=mentions)) | ||
| new_name_choice = input('Is there a nickname that we neglected? y/n') | ||
| if new_name_choice == 'y': | ||
| new_name = input('Please enter such a name: ') | ||
| print('Finished diambiguating {name}'.format(name=name)) | ||
|
|
||
|
|
||
| # output to a file | ||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
| def char_name_disambiguation(self, char_list): | ||
| """given a list of char names in a document, group them by potential nicknames | ||
| :param char_list: a list of character as well as their freq from get_char_list | ||
| :return: a list of list of character names and freq where the first one is the name, | ||
| followed by nicknames | ||
| >>> from corpus_analysis import document | ||
| >>> from pathlib import Path | ||
| >>> from gender_analysis import common | ||
| >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818', | ||
| ... 'filename': 'austen_persuasion.txt', | ||
| ... 'filepath': Path(common.TEST_DATA_PATH, 'sample_novels', | ||
| ... 'texts', 'austen_persuasion.txt')} | ||
| >>> persuasion = document.Document(document_metadata) | ||
| >>> persuasion_chars = persuasion.get_char_list(20) | ||
| >>> disamb = persuasion.char_name_disambiguation(persuasion_chars) | ||
| >>> disamb | ||
| [[('Anne', 425)], [('Captain Wentworth', 119), ('Wentworth', 31)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]] | ||
| >>> len(disamb) | ||
| 22 | ||
| """ | ||
| to_return = [] | ||
| for i in range(len(char_list) - 1): | ||
| char_cluster = [char_list[i]] | ||
| for j in range(i + 1, len(char_list)): | ||
| if set(self.filter_honr(char_list[i][0])).intersection( | ||
| set(self.filter_honr(char_list[j][0]))): | ||
| char_cluster.append(char_list[j]) | ||
| to_return.append(char_cluster) | ||
| return to_return | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As we talked about in Slack briefly, this method probably requires some thinking through. If I'm reading the test output in 710 correctly, it looks like the disambiguation is overly generous, and I suspect we can figure out a more optimized way to traverse those character lists. Let's chat through some issues in office hours. |
||
|
|
||
|
|
||
    # NOTE(review): dead draft kept inert inside a string literal.
    # If revived, two problems need fixing: `output[filtered_name] + {name: 1}`
    # discards its result (and Counter + dict raises TypeError) — it should be
    # `output[filtered_name].update({name: 1})` — and `remove_honorifics` is
    # not defined anywhere visible; presumably `filter_honr` was intended.
    '''def char_disamb_rev(self,char_list):
        output = {}
        for name in char_list:
            filtered_name = remove_honorifics(name)
            if filtered_name not in output:
                output[filtered_name] = Counter({name: 1})
            else:
                output[filtered_name] + {name: 1}'''
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,11 +3,9 @@ | |
| 'dunning', | ||
| 'gender_frequency', | ||
| 'instance_distance', | ||
| 'proximity', | ||
| ] | ||
|
|
||
| from gender_analysis.analysis.dependency_parsing import * | ||
| from gender_analysis.analysis.dunning import * | ||
| from gender_analysis.analysis.gender_frequency import * | ||
| from gender_analysis.analysis.instance_distance import * | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A note for posterity. This is a temporary measure to prevent circular imports caused by |
||
| from gender_analysis.analysis.proximity import * | ||
Uh oh!
There was an error while loading. Please reload this page.