2020 ("${start}<SEQ>${end}" , r"(?P<start>[^A-Za-z0-9]|^)([0-9a-f]{6,} ?){3,}(?P<end>[^A-Za-z0-9]|$)" ),
2121 ("${start}<SEQ>${end}" , r"(?P<start>[^A-Za-z0-9]|^)([0-9A-F]{4} ?){4,}(?P<end>[^A-Za-z0-9]|$)" ),
2222 ("${start}<HEX>${end}" , r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)" ),
23+ # ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)([a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)"),
24+ # ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]+|[a-f0-9A-F]+)(?P<end>[^A-Za-z0-9]|$)"),
25+ # ("${start}<HEX>${end}", r"(?P<start>[^A-Za-z0-9]|^)(0x[a-f0-9A-F]{2,}(?:[a-f0-9A-F]{2})*|[a-f0-9A-F]{2}(?:[a-f0-9A-F]{2})*)(?P<end>[^A-Za-z0-9]|$)"),
2326 ("${start}<NUM>${end}" , r"(?P<start>[^A-Za-z0-9]|^)([\-\+]?\d+)(?P<end>[^A-Za-z0-9]|$)" ),
2427 ("${cmd}<CMD>" , r"(?P<cmd>executed cmd )(\".+?\")" )
2528]
@@ -85,7 +88,7 @@ def _create_cngram(self, message, ngram=3):
8588 return [message [i :i + ngram ] for i in range (len (message ) - ngram + 1 )]
8689
8790 # Enrich with drain parsing results
88- def parse_drain (self , field = "e_message_normalized" , drain_masking = False , reparse = False ):
91+ def parse_drain (self , field = "e_message_normalized" , drain_masking = False , reparse = False , templates = False ):
8992 self ._handle_prerequisites ([field ])
9093 if reparse or "e_event_drain_id" not in self .df .columns :
9194 import drain3 as dr
@@ -125,10 +128,15 @@ def parse_drain(self, field = "e_message_normalized", drain_masking=False, repar
125128 self .df = self .df .with_columns (
126129 drain = pl .col (field ).map_elements (lambda x : tm .add_log_message (x ), return_dtype = return_dtype ))
127130
128- self .df = self .df .with_columns (
129- # extra letter to ensure we e1 e2 instead of 1 2
130- e_event_drain_id = pl .lit ("e" ) + pl .col ("drain" ).struct .field ("cluster_id" ).cast (pl .Utf8 ),
131- e_template = pl .col ("drain" ).struct .field ("template_mined" ))
131+ if templates :
132+ self .df = self .df .with_columns (
133+ # extra letter to ensure we get e1 e2 instead of 1 2
134+ e_event_drain_id = pl .lit ("e" ) + pl .col ("drain" ).struct .field ("cluster_id" ).cast (pl .Utf8 ),
135+ e_event_drain_template = pl .col ("drain" ).struct .field ("template_mined" ))
136+ else :
137+ self .df = self .df .with_columns (
138+ # extra letter to ensure we get e1 e2 instead of 1 2
139+ e_event_drain_id = pl .lit ("e" ) + pl .col ("drain" ).struct .field ("cluster_id" ).cast (pl .Utf8 ))
132140 self .df = self .df .drop ("drain" ) # Drop the dictionary produced by drain. Event_id and template are the most important.
133141 # tm.drain.print_tree()
134142 return self .df
@@ -159,7 +167,7 @@ def parse_ael(self,field = "e_message_normalized", reparse=False):
159167 return self .df
160168
161169 # New parser, not yet released to the public. Coming early 2024
162- def parse_tip (self , field = "e_message_normalized" , reparse = False ):
170+ def parse_tip (self , field = "e_message_normalized" , reparse = False , templates = False ):
163171 self ._handle_prerequisites ([field ])
164172 if reparse or "e_event_tip_id" not in self .df .columns :
165173 if "e_event_tip_id" in self .df .columns :
@@ -168,12 +176,27 @@ def parse_tip(self, field = "e_message_normalized", reparse=False):
168176 if "row_nr" in self .df .columns :
169177 self .df = self .df .drop ("row_nr" )
170178 self .df = self .df .with_row_count ()
171- tipping_clusters = tip .parse (self .df [field ], return_templates = False , return_masks = False )
172- df_new = pl .DataFrame (
173- {
174- "e_event_tip_id" : tipping_clusters [0 ],
175- }
176- )
179+ tipping_clusters , tipping_masks , tipping_templates = tip .parse (self .df [field ], return_templates = templates , return_masks = False )
180+ if templates :
181+ df_new = pl .DataFrame (
182+ {
183+ "e_event_tip_id" : tipping_clusters ,
184+ }
185+ )
186+ # take the first template from each set (None if the set is empty)
187+ tipping_templates = [list (s )[0 ] if s else None for s in tipping_templates ]
188+ df_templates = pl .DataFrame ({
189+ "e_event_tip_id" : range (len (tipping_templates )),
190+ "e_event_tip_template" : tipping_templates
191+ })
192+ df_new = df_new .join (df_templates , on = "e_event_tip_id" , how = "left" )
193+ df_new = df_new .with_columns (pl .col ("e_event_tip_template" ).cast (pl .Utf8 ))
194+ else :
195+ df_new = pl .DataFrame (
196+ {
197+ "e_event_tip_id" : tipping_clusters ,
198+ }
199+ )
177200 df_new = df_new .with_columns (
178201 e_event_tip_id = pl .when (pl .col ("e_event_tip_id" ).is_null ())
179202 .then (pl .lit ("e_null" ))
@@ -349,7 +372,9 @@ def normalize(self, regexs=masking_patterns_drain, to_lower=False, twice=True):
349372 # print (base_code)
350373 # return base_code
351374
352- def item_cumsum2 (self , column = "e_message_normalized" , chronological_order = 1 , ano_only = True , unique_only = True , out_column = "" ):
375+ def item_cumsum2 (self , column = "e_message_normalized" , chronological_order = 1 , ano_only = True , unique_only = True , out_column = None ):
376+ if out_column is None :
377+ out_column = column + "_cumsum"
353378 column_name = out_column
354379 self ._handle_prerequisites ([column , 'm_timestamp' ])
355380 if ano_only :
0 commit comments