diff --git a/intelligence_toolkit/query_text_data/api.py b/intelligence_toolkit/query_text_data/api.py index 8c352e3..d657264 100644 --- a/intelligence_toolkit/query_text_data/api.py +++ b/intelligence_toolkit/query_text_data/api.py @@ -133,7 +133,7 @@ def process_text_chunks( self, max_cluster_size: int = 25, min_edge_weight: int = 2, - min_node_degree: int = 1, + min_node_degree: int = 2, callbacks=[], ) -> ProcessedChunks: """ diff --git a/intelligence_toolkit/query_text_data/graph_builder.py b/intelligence_toolkit/query_text_data/graph_builder.py index 0a56eb2..c1eadb8 100644 --- a/intelligence_toolkit/query_text_data/graph_builder.py +++ b/intelligence_toolkit/query_text_data/graph_builder.py @@ -28,10 +28,10 @@ def update_concept_graph_edges(node_to_period_counts, edge_to_period_counts, per nps = sorted(set(TextBlob(chunk).noun_phrases)) filtered_nps = [] for np in nps: - # split on space or newline - parts = re.split(r"[\s\n]+", np) - if all([re.match(r"[a-zA-Z0-9\-]+", part) for part in parts]): - filtered_nps.append(np) + # split on space + parts = [p for p in re.split(r"[\s]+", np) if len(p) > 0] + if len(parts) > 1 and all([re.match(r"^[a-zA-Z0-9\-]+\n?$", part) for part in parts]): + filtered_nps.append(np.replace("\n", "")) filtered_nps = sorted(filtered_nps) for np in filtered_nps: concept_to_cids[np].append(cid) @@ -87,7 +87,7 @@ def build_meta_graph(G, hierarchical_communities): return level_to_label_to_network -def prepare_concept_graph(G, min_edge_weight, min_node_degree, std_trim=4): +def prepare_concept_graph(G, min_edge_weight, min_node_degree, std_trim=3): degrees = [x[1] for x in G.degree()] mean_degree = np.mean(degrees) std_degree = np.std(degrees)