preprocessing_topic_models.py
import re

import nltk
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Requires the NLTK 'punkt', 'wordnet', and 'stopwords' data packages;
# download them once with nltk.download() if they are not already installed.

stop_words = set(stopwords.words('english'))  # a set makes membership tests O(1)
normalizer = WordNetLemmatizer()
def get_part_of_speech(word):
    """Return the WordNet POS tag ('n', 'v', 'a', or 'r') with the most
    synsets for the word; ties (and unknown words) default to 'n'."""
    probable_part_of_speech = wordnet.synsets(word)
    # Count how many of the word's synsets fall under each part of speech.
    pos_counts = Counter()
    pos_counts["n"] = len([item for item in probable_part_of_speech if item.pos() == "n"])
    pos_counts["v"] = len([item for item in probable_part_of_speech if item.pos() == "v"])
    pos_counts["a"] = len([item for item in probable_part_of_speech if item.pos() == "a"])
    pos_counts["r"] = len([item for item in probable_part_of_speech if item.pos() == "r"])
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech
def preprocess_text(text):
    """Lowercase and strip non-word characters, tokenize, lemmatize each
    token with its most likely POS, and drop English stop words."""
    cleaned = re.sub(r'\W+', ' ', text).lower()
    tokenized = word_tokenize(cleaned)
    normalized = [normalizer.lemmatize(token, get_part_of_speech(token))
                  for token in tokenized]
    filtered = [word for word in normalized if word not in stop_words]
    return " ".join(filtered)