Skip to content

Commit a48b5a6

Browse files
committed
Allow the Drain and Tipping parsers to fetch mined templates into the df
1 parent b72a4ca commit a48b5a6

1 file changed

Lines changed: 38 additions & 13 deletions

File tree

loglead/enhancer.py

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
("${start}<SEQ>${end}", r"(?P<start>[^A-Za-z0-9]|^)([0-9a-f]{6,} ?){3,}(?P<end>[^A-Za-z0-9]|$)"),
2121
("${start}<SEQ>${end}", r"(?P<start>[^A-Za-z0-9]|^)([0-9A-F]{4} ?){4,}(?P<end>[^A-Za-z0-9]|$)"),
2222
("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)"),
23+
# ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)([a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)"),
24+
# ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]+|[a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)"),
25+
# ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]{2,}(?:[a-f0-9A-F]{2})*|[a-f0-9A-F]{2}(?:[a-f0-9A-F]{2})*)(?P<end>[^A-Za-z0-9]|$)"),
2326
("${start}<NUM>${end}", r"(?P<start>[^A-Za-z0-9]|^)([\-\+]?\d+)(?P<end>[^A-Za-z0-9]|$)"),
2427
("${cmd}<CMD>", r"(?P<cmd>executed cmd )(\".+?\")")
2528
]
@@ -85,7 +88,7 @@ def _create_cngram(self, message, ngram=3):
8588
return [message[i:i + ngram] for i in range(len(message) - ngram + 1)]
8689

8790
# Enrich with drain parsing results
88-
def parse_drain(self, field = "e_message_normalized", drain_masking=False, reparse=False):
91+
def parse_drain(self, field = "e_message_normalized", drain_masking=False, reparse=False, templates=False):
8992
self._handle_prerequisites([field])
9093
if reparse or "e_event_drain_id" not in self.df.columns:
9194
import drain3 as dr
@@ -125,10 +128,15 @@ def parse_drain(self, field = "e_message_normalized", drain_masking=False, repar
125128
self.df = self.df.with_columns(
126129
drain=pl.col(field).map_elements(lambda x: tm.add_log_message(x), return_dtype=return_dtype))
127130

128-
self.df = self.df.with_columns(
129-
# extra letter to ensure we e1 e2 instead of 1 2
130-
e_event_drain_id=pl.lit("e") + pl.col("drain").struct.field("cluster_id").cast(pl.Utf8),
131-
e_template=pl.col("drain").struct.field("template_mined"))
131+
if templates:
132+
self.df = self.df.with_columns(
133+
# extra letter to ensure we get e1 e2 instead of 1 2
134+
e_event_drain_id=pl.lit("e") + pl.col("drain").struct.field("cluster_id").cast(pl.Utf8),
135+
e_event_drain_template=pl.col("drain").struct.field("template_mined"))
136+
else:
137+
self.df = self.df.with_columns(
138+
# extra letter to ensure we get e1 e2 instead of 1 2
139+
e_event_drain_id=pl.lit("e") + pl.col("drain").struct.field("cluster_id").cast(pl.Utf8))
132140
self.df = self.df.drop("drain") # Drop the dictionary produced by drain. Event_id and template are the most important.
133141
# tm.drain.print_tree()
134142
return self.df
@@ -159,7 +167,7 @@ def parse_ael(self,field = "e_message_normalized", reparse=False):
159167
return self.df
160168

161169
#New parser not yet released to public. Coming early 2024
162-
def parse_tip(self, field = "e_message_normalized", reparse=False):
170+
def parse_tip(self, field = "e_message_normalized", reparse=False, templates=False):
163171
self._handle_prerequisites([field])
164172
if reparse or "e_event_tip_id" not in self.df.columns:
165173
if "e_event_tip_id" in self.df.columns:
@@ -168,12 +176,27 @@ def parse_tip(self, field = "e_message_normalized", reparse=False):
168176
if "row_nr" in self.df.columns:
169177
self.df = self.df.drop("row_nr")
170178
self.df = self.df.with_row_count()
171-
tipping_clusters = tip.parse(self.df[field], return_templates=False, return_masks=False)
172-
df_new = pl.DataFrame(
173-
{
174-
"e_event_tip_id": tipping_clusters[0],
175-
}
176-
)
179+
tipping_clusters, tipping_masks, tipping_templates = tip.parse(self.df[field], return_templates=templates, return_masks=False)
180+
if templates:
181+
df_new = pl.DataFrame(
182+
{
183+
"e_event_tip_id": tipping_clusters,
184+
}
185+
)
186+
#convert sets to lists
187+
tipping_templates = [list(s)[0] if s else None for s in tipping_templates]
188+
df_templates = pl.DataFrame({
189+
"e_event_tip_id": range(len(tipping_templates)),
190+
"e_event_tip_template": tipping_templates
191+
})
192+
df_new = df_new.join(df_templates, on="e_event_tip_id", how="left")
193+
df_new = df_new.with_columns(pl.col("e_event_tip_template").cast(pl.Utf8))
194+
else:
195+
df_new = pl.DataFrame(
196+
{
197+
"e_event_tip_id": tipping_clusters,
198+
}
199+
)
177200
df_new = df_new.with_columns(
178201
e_event_tip_id=pl.when(pl.col("e_event_tip_id").is_null())
179202
.then(pl.lit("e_null"))
@@ -349,7 +372,9 @@ def normalize(self, regexs=masking_patterns_drain, to_lower=False, twice=True):
349372
# print (base_code)
350373
# return base_code
351374

352-
def item_cumsum2(self, column="e_message_normalized", chronological_order=1, ano_only=True, unique_only=True, out_column=""):
375+
def item_cumsum2(self, column="e_message_normalized", chronological_order=1, ano_only=True, unique_only=True, out_column=None):
376+
if out_column is None:
377+
out_column = column + "_cumsum"
353378
column_name = out_column
354379
self._handle_prerequisites([column, 'm_timestamp'])
355380
if ano_only:

0 commit comments

Comments (0)