Skip to content

Commit b88a960

Browse files
author
Y
authored
fix: stripping newline and tab and tokenizing issue body (#38)
1 parent 61a914e commit b88a960

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

mine-issues.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import argparse
2+
import re
23

34
from math import ceil
45
from urllib.parse import urlencode
@@ -199,9 +200,14 @@ def sort_by_arg_checker(arg_value):
199200
# Also add each issue's body (description/first comment) to the
200201
# comment corpus array to be processed as well.
201202
issue_body = r['body'] or ""
203+
# Also replace any code blocks (fenced or inline) in the issue's description with the CODE token.
204+
issue_body = re.sub('```([^`]*)```|`([^`]*)`', 'CODE', issue_body)
202205
issue_body_lines = issue_body.splitlines()
203206
for line in issue_body_lines:
204-
if line != "":
207+
# Strip any leading/trailing newline and tab characters
208+
line = line.strip('\n')
209+
line = line.strip('\t')
210+
if line:
205211
CORPUS.append({
206212
"issueID": r['id'],
207213
"issueURL_API": r['url'],

utils/githubAPI.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ def gitHubCommentAPI(issues):
7878

7979
comment_lines = code_tokenized_comment.splitlines()
8080
for line in comment_lines:
81+
line = line.strip('\n')
82+
line = line.strip('\t')
8183
if line:
8284
results.append({
8385
"issueID": i['issueID'],

0 commit comments

Comments
 (0)