diff --git a/generators/__init__.py b/generators/__init__.py index d25aae0e..f70ce685 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -1,3 +1,4 @@ + from fastapi import APIRouter from .distance import ( @@ -11,6 +12,10 @@ spacy_lemmatizer, ) +from .numbers import ( + annotator_split +) + from .paths import ( url_keyword_parser, domain_parser, @@ -45,7 +50,11 @@ syllable_count, ) -from .text_cleaning import html_cleanser, html_unescape +from .text_cleaning import ( + email_cleaner, + html_cleanser, + html_unescape, +) from .translation import ( deepl_translator, @@ -96,6 +105,8 @@ bert_toxicity_detector, gpt_grammar_correction, gpt_tldr_summarization, + email_cleaner, + annotator_split, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/text_cleaning/email_cleaner/README.md b/generators/text_cleaning/email_cleaner/README.md new file mode 100644 index 00000000..daae5e9b --- /dev/null +++ b/generators/text_cleaning/email_cleaner/README.md @@ -0,0 +1 @@ +This module removes certain parts of an email so that the focus is on its actual content. The parts being removed are sentences starting with "EXTERNAL EMAIL", bracketed references that start with "cid:image", everything from the disclaimer up to the next mail (in the case of reply chains) or the end of the text, and everything from the signature marker (e.g. "[signature]") up to the next mail or the end of the text. \ No newline at end of file diff --git a/generators/text_cleaning/email_cleaner/__init__.py b/generators/text_cleaning/email_cleaner/__init__.py new file mode 100644 index 00000000..518ac8a1 --- /dev/null +++ b/generators/text_cleaning/email_cleaner/__init__.py @@ -0,0 +1,49 @@ +from pydantic import BaseModel +import re + +INPUT_EXAMPLE = { + """Hi Sofia, +I hope this email finds you well. I have some exciting news to share with you regarding a potential new client for StellarDefense Insurance. We have recently received an application from a company called Bleyerstift and More, who are in need of insurance coverage. 
Bleyerstift and More is a reputable company in the manufacturing industry. They operate in the pharmaceutical sector, specializing in the production of medical supplies. With a workforce of approximately 500 employees, they are located at 123 Main Street, Anytown, USA. You can find more information about them on their website at www.bleyerstiftandmore.com. +The client has requested a submission to be completed by April 1st, 2024. They are specifically interested in obtaining a comprehensive general liability insurance policy, with a coverage limit of $1 million for each occurrence. +Please let me know if you require any additional information from them or if there are any specific questions you would like me to address. As for attachments, there is a document that provides a detailed breakdown of Bleyerstift and More's revenue and other pertinent financial information. +I have included this attachment for your reference. I believe this opportunity has great potential for StellarDefense Insurance's growth and would appreciate your assistance in handling this case. If you have any questions or need any further information, please do not hesitate to reach out to me. Thank you for your time and support in this matter. +[cid:image012915.png@C10DB1A7.DEFECF3B] +Best regards, +Amelia Smith Insurance Broker StellarDefense Insurance + +DISCLAIMER + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by blubb. 
# Clean-up rules, applied in this order. The IGNORECASE flag is bound at
# compile time on purpose: the fourth positional argument of ``re.sub`` is
# ``count``, not ``flags``, so ``re.sub(p, "", text, re.IGNORECASE)`` silently
# runs case-sensitively and stops after two replacements.
_EMAIL_NOISE_PATTERNS = (
    # Disclaimer block: from "DISCLAIMER" up to the next quoted mail
    # ("From:") or the end of the text. ``[\s\S]`` matches any character,
    # newlines included.
    re.compile(r"DISCLAIMER[\s\S]+?(?=From:|\Z)", re.IGNORECASE),
    # "EXTERNAL EMAIL ..." banner, up to and including the first period on
    # the same line (``.`` does not cross newlines).
    re.compile(r"EXTERNAL EMAIL.*?\.", re.IGNORECASE),
    # Inline image references such as "[cid:image001.png@...]".
    re.compile(r"\[cid:image.*?\]", re.IGNORECASE),
    # Signature block: from the word "signature" up to the next quoted mail
    # or the end of the text.
    re.compile(r"signature[\s\S]+?(?=From:|\Z)", re.IGNORECASE),
)


def email_cleaner(text):
    """Strip boilerplate from an email body and return what is left.

    The following spans are removed, all matched case-insensitively:
    the disclaimer block, "EXTERNAL EMAIL" banners, "[cid:image...]"
    references and the signature block. Disclaimer and signature blocks end
    right before the next "From:" header, so quoted mails in a reply chain
    are kept.

    NOTE(review): the signature rule starts at the word "signature" itself,
    so the opening "[" of a "[signature]" marker stays in the output, and
    any occurrence of the word in running text triggers the rule.

    Args:
        text: Raw email text.

    Returns:
        The text with every matching span removed. Whitespace around the
        removed spans is left untouched.
    """
    for pattern in _EMAIL_NOISE_PATTERNS:
        text = pattern.sub("", text)
    return text
Bleyerstift and More is a reputable company in the manufacturing industry. They operate in the pharmaceutical sector, specializing in the production of medical supplies. With a workforce of approximately 500 employees, they are located at 123 Main Street, Anytown, USA. You can find more information about them on their website at www.bleyerstiftandmore.com. +The client has requested a submission to be completed by April 1st, 2024. They are specifically interested in obtaining a comprehensive general liability insurance policy, with a coverage limit of $1 million for each occurrence. +Please let me know if you require any additional information from them or if there are any specific questions you would like me to address. As for attachments, there is a document that provides a detailed breakdown of Bleyerstift and More's revenue and other pertinent financial information. +I have included this attachment for your reference. I believe this opportunity has great potential for StellarDefense Insurance's growth and would appreciate your assistance in handling this case. If you have any questions or need any further information, please do not hesitate to reach out to me. Thank you for your time and support in this matter. +[cid:image012915.png@C10DB1A7.DEFECF3B] +Best regards, +Amelia Smith Insurance Broker StellarDefense Insurance + +DISCLAIMER + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by blubb. 
ATTRIBUTE: str = "headline"  # only text attributes

# Clean-up rules, applied in this order. The IGNORECASE flag is bound at
# compile time on purpose: the fourth positional argument of ``re.sub`` is
# ``count``, not ``flags``, so ``re.sub(p, "", text, re.IGNORECASE)`` silently
# runs case-sensitively and stops after two replacements.
_EMAIL_NOISE_PATTERNS = (
    # Disclaimer block: from "DISCLAIMER" up to the next quoted mail
    # ("From:") or the end of the text.
    re.compile(r"DISCLAIMER[\s\S]+?(?=From:|\Z)", re.IGNORECASE),
    # "EXTERNAL EMAIL ..." banner, up to and including the first period on
    # the same line.
    re.compile(r"EXTERNAL EMAIL.*?\.", re.IGNORECASE),
    # Inline image references such as "[cid:image001.png@...]".
    re.compile(r"\[cid:image.*?\]", re.IGNORECASE),
    # Signature block: from the word "signature" up to the next quoted mail
    # or the end of the text.
    re.compile(r"signature[\s\S]+?(?=From:|\Z)", re.IGNORECASE),
)


def email_cleaner(record):
    """Return the text of ``record[ATTRIBUTE]`` with email boilerplate removed.

    The following spans are removed, all matched case-insensitively:
    the disclaimer block, "EXTERNAL EMAIL" banners, "[cid:image...]"
    references and the signature block. Disclaimer and signature blocks end
    right before the next "From:" header, so quoted mails in a reply chain
    are kept.

    Args:
        record: Mapping from attribute name to an object exposing ``.text``
            (presumably a spaCy ``Doc`` inside refinery; confirm).

    Returns:
        The cleaned text as a ``str``.
    """
    text = record[ATTRIBUTE].text
    for pattern in _EMAIL_NOISE_PATTERNS:
        text = pattern.sub("", text)
    return text
def get_config():
    """Build the bricks generator configuration for the email cleaner.

    Returns whatever ``build_generator_function_config`` produces for the
    ``email_cleaner`` function and the settings assembled below.
    """
    # Bricks integrator information: the single ATTRIBUTE variable the
    # refinery snippet exposes, offered as a choice.
    attribute_variable = {
        "selectionType": SelectionType.CHOICE.value,
        "addInfo": [
            BricksVariableType.ATTRIBUTE.value,
            BricksVariableType.GENERIC_STRING.value,
        ],
    }
    integrator_inputs = {
        "name": "email_cleaner",
        "refineryDataType": RefineryDataType.TEXT.value,
        "variables": {"ATTRIBUTE": attribute_variable},
    }

    # First entry should be the parent directory.
    groups = ["text_cleaning"]

    # NOTE(review): INPUT_EXAMPLE in this package's __init__.py is written as
    # a set literal ({"""..."""}) rather than a dict keyed by the model field
    # ``email``; confirm the config builder and the API schema accept that.
    return build_generator_function_config(
        function=email_cleaner,
        input_example=INPUT_EXAMPLE,
        issue_id=328,
        tabler_icon="square-rounded-letter-e",
        min_refinery_version="1.7.0",
        state=State.PUBLIC.value,
        type="python_function",
        kern_token_proxy_usable="false",
        docker_image="none",
        available_for=["refinery", "common"],
        part_of_group=groups,
        integrator_inputs=integrator_inputs,
    )