From a27c176d13ecc64c5279493449beada27fe92951 Mon Sep 17 00:00:00 2001 From: gastonq Date: Tue, 16 Jul 2019 15:14:54 +0000 Subject: [PATCH 1/4] commit --- 01_NLP_Annotations.ipynb | 74 +++++- 04_NLP_Evaluation_Metrics.ipynb | 6 +- 06_Keyword_Searching_and_ErrorAnalysis.ipynb | 215 +++++++++--------- ...onia_pyConText_targets_and_modifiers.ipynb | 188 ++++++++++----- 10_NLP_DocumentClassification.ipynb | 121 +++++----- KB/pneumonia_modifiers.yml | 32 ++- KB/pneumonia_targets.yml | 2 +- 7 files changed, 402 insertions(+), 236 deletions(-) diff --git a/01_NLP_Annotations.ipynb b/01_NLP_Annotations.ipynb index 69a3782..302c2ec 100644 --- a/01_NLP_Annotations.ipynb +++ b/01_NLP_Annotations.ipynb @@ -10,11 +10,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.\n", + " \"`IPython.html.widgets` has moved to `ipywidgets`.\", ShimWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded utilities...\n" + ] + } + ], "source": [ "import urllib.request\n", "import os\n", @@ -51,11 +67,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading annotations from file : data/training_v2.zip\n", + "Opening local file : data/training_v2.zip\n", + "Total Annotated Documents : 70\n", + "Total Positive Pneumonia Documents : 34\n" + ] + } + ], "source": [ "# First thing, let's load our training set\n", "annotated_doc_map = read_doc_annotations('data/training_v2.zip')\n", @@ -70,6 +97,23 @@ "print('Total Positive Pneumonia Documents : {0}'.format(total_positives))" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total Positive Pneumonia Documents : HOLA\n" + ] + } + ], + "source": [ + "print('Total Positive Pneumonia Documents : {0}{1}{2}{3}'.format(\"H\",\"O\",\"L\",\"A\"))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -99,9 +143,8 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], @@ -117,11 +160,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4c896d7363204445a395324af7a18c08", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(IntSlider(value=0, description='i', max=69), Output()), _dom_classes=('widget-interact',…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "view_annotation_markup(annotated_docs)" ] diff --git a/04_NLP_Evaluation_Metrics.ipynb b/04_NLP_Evaluation_Metrics.ipynb index f244f60..0e0b1fd 100644 --- a/04_NLP_Evaluation_Metrics.ipynb +++ b/04_NLP_Evaluation_Metrics.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -48,7 +48,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } diff --git a/06_Keyword_Searching_and_ErrorAnalysis.ipynb b/06_Keyword_Searching_and_ErrorAnalysis.ipynb index d9998b9..e8d19ec 100644 --- a/06_Keyword_Searching_and_ErrorAnalysis.ipynb +++ b/06_Keyword_Searching_and_ErrorAnalysis.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,10 @@ " self.keywords = set()\n", " def predict(self, text):\n", " prediction = 0\n", - "# your code here\n", + " #for word in text.split():\n", + " for keyword in self.keywords:\n", + " if keyword in text:\n", + " return 1\n", " return prediction\n", " \n" ] @@ -72,12 +75,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Test the function you just wrote by adding one keyword to the set: 'pneumonia'" + "Test the function you just wrote by adding one keyword to the set: 'pneumonia'\n" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -95,6 +98,11 @@ "source": [ "keyword_classifier = KeywordClassifier()\n", "keyword_classifier.keywords.add('pneumonia')\n", + "\n", + "#keyword_classifier.keywords.add('consolidation')\n", + "#keyword_classifier.keywords.add('infiltrate')\n", + "#keyword_classifier.keywords.add('fever')\n", + "#keyword_classifier.keywords.add('cough')\n", "annotated_doc_map = read_doc_annotations('data/training_v2.zip')\n", "print('Total Annotated Documents : {0}'.format(len(annotated_doc_map)))\n", "\n", @@ -123,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -172,13 +180,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ef07cb4a927f4b829c5aecd3541c90ca", + "model_id": "990476361b984d86ab280c0b5d512c63", "version_major": 2, "version_minor": 0 }, @@ -212,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -264,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 66, "metadata": { "scrolled": false }, @@ -272,37 +280,77 @@ { "data": { "text/html": [ - "
document nameSnippets
subject_id_6349_hadm_id_20192
There is no pulmonary vascular\n", - " engorgement. There is an increasing left-sided pleural effusion with\n", - " associated atelectasis. Pneumonia at this location cannot be excluded. Noted\n", - " is a density in the left upper lung z
subject_id_157_hadm_id_26180
ung is incompletely imaged\n", - " on this study and there is a questionable area of abnormality partially\n", - " obscuring the mid portion of the right hemidiaphragm, incompletely evaluated.\n", - " \n", - " IMPRESSION:
subject_id_150_hadm_id_12121
es are\n", + "
document nameSnippets
subject_id_146_hadm_id_18965
al effusion.\n", + " Right CPA not included on film. There is obscuration of left hemidiaphragm\n", + " likely secondary to atelectasis/consolidation in left lower lobe.\n", + "\n", + "
subject_id_150_hadm_id_12121
es are\n", " unremarkable.\n", " \n", " IMPRESSION: Small focal opacity in right upper lobe and right paratracheal\n", " opacity. In the sett
CHEST PA AND LATERAL: The heart size is normal. There is an area of\n", " increased opacity lateral to the right paratracheal stripe. In the
pacity lateral to the right paratracheal stripe. In the right\n", " upper lobe, there is a small focal opacity. The lungs are otherwise clear.\n", - " There are no
subject_id_7675_hadm_id_3
r contours are stable\n", + " There are no
subject_id_157_hadm_id_26180
ung is incompletely imaged\n", + " on this study and there is a questionable area of abnormality partially\n", + " obscuring the mid portion of the right hemidiaphragm, incompletely evaluated.\n", + " \n", + " IMPRESSION:
subject_id_261_hadm_id_19250
ignificant change compared with [**3197-12-9**]. Bilateral pulmonary\n", + " opacities involving the lower and mid lung zones.\n", + "\n", + "
c\n", + " angle is not included in this radiograph. There is left lower lobe\n", + " consolidation/collapse. There is bilateral lower zone and mid zone\n", + "
is left lower lobe\n", + " consolidation/collapse. There is bilateral lower zone and mid zone\n", + " infiltrate. The heart size is slightly enlarged.\n", + " \n", + "
subject_id_5472_hadm_id_11987
id SVC. \n", + " There is no apparent pneumothorax. A right IJ line, NGT, and ETT are\n", + " unchanged as are the parenchymal changes in the lungs compared to the earlier\n", + " chest x-ray this mor
subject_id_6349_hadm_id_20192
There is no pulmonary vascular\n", + " engorgement. There is an increasing left-sided pleural effusion with\n", + " associated atelectasis. Pneumonia at this location cannot be excluded. Noted\n", + " is a density in the left upper lung z
subject_id_7027_hadm_id_33117
ossibility of free\n", + " intraperitoneal air.\n", + " 2) Left lower lobe atelectasis/consolidation.\n", + " 3) Moderate gastric distention with multiple
stinal and hilar contours are\n", + " unremarkable. There is patchy opacity at the left lower lobe representing\n", + " either atelectasis or consolidation. No definite free air is identified,\n", + " howeve
subject_id_7272_hadm_id_19098
rt failure with bilateral pleural effusions.\n", + " Collapse and/or consolidation at the bases bilaterally.\n", + "\n", + "
rall heart size is difficult to assess. There is dense retrocardiac\n", + " opacity, possibly secondary to collapse and/or consolidation in the left lower\n", + " lobe. There is also a rig
ion in the left lower\n", + " lobe. There is also a right lower lobe and middle lobe opacity consistent\n", + " with collapse and/or consolidation.\n", + " \n", + " IMPRESSION: Persistent left heart fai
subject_id_7525_hadm_id_19141
n distal superior vena cava, unchanged. There is marked\n", + " improvement of the bilateral consolidations, especially on the right. The NG\n", + " tube tip is
subject_id_7675_hadm_id_3
r contours are stable\n", " since the prior study. There is persistent opacification in the right lower\n", " lung zone. The pulmonary vascularity is unremarkable. The e
terminating in the left IJ. No\n", " pneumothorax. Persistent right lower lung zone opacification and probable\n", " small effusion.\n", "\n", - "
subject_id_9082_hadm_id_29395
tient with seizure.\n", + "
subject_id_8494_hadm_id_20131
lung and right mediastinal\n", + " lymphadenopathy. Patchy opacities are seen throughout the right lung which\n", + " could be due to atelectasis from compression of the right lung or represent\n", + " air space consolidation. An ET tube is identified with tip 6.3 cm from th
subject_id_8734_hadm_id_8478
Fluid overload vs. congestive failure.\n", + " 2) Possible developing left infiltrate.\n", + "\n", + "
minent, consistent with fluid overload. There is increased opacity within\n", + " the left lower lung. It is not clear if this is related to technique,\n", + " overlying soft tissue, or an underlying infiltrate. There is no definite\n", + " pleural effusion. The
subject_id_9082_hadm_id_29395
tient with seizure.\n", " \n", " Low lung volumes. Bilateral basilar opacities, considerably larger at the\n", " left base than at
hyroid.\n", " \n", " IMPRESSION: Lung volumes with bilateral basilar opacities.\n", " \n", - " Question substernal thyroid enlargemen
subject_id_5472_hadm_id_11987
id SVC. \n", - " There is no apparent pneumothorax. A right IJ line, NGT, and ETT are\n", - " unchanged as are the parenchymal changes in the lungs compared to the earlier\n", - " chest x-ray this mor
" + " Question substernal thyroid enlargemen
" ], "text/plain": [ "" @@ -315,7 +363,16 @@ "source": [ "fn=list_false_negatives(annotated_doc_map, keyword_classifier.predict)\n", "docs=list(fn.keys())\n", - "display(HTML(snippets_markup(fn)))" + "display(HTML(snippets_markup(fn)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(fn)" ] }, { @@ -327,19 +384,19 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'pneumonia', 'consolidation'}\n" + "{'pneumonia', 'Pneumonia', 'consolidation', 'infiltrate', 'Consolidation', 'consolidations'}\n" ] } ], "source": [ - "keyword_classifier.keywords = {'pneumonia', 'consolidation'}\n", + "keyword_classifier.keywords = {'pneumonia', 'consolidation','Pneumonia','infiltrate','Consolidation','consolidations'}\n", "print(keyword_classifier.keywords)" ] }, @@ -354,16 +411,16 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Precision : 0.6829268292682927\n", - "Recall : 0.8235294117647058\n", - "F1: 0.7466666666666667\n", + "Precision : 0.6444444444444445\n", + "Recall : 0.8529411764705882\n", + "F1: 0.7341772151898734\n", "\n", "Confusion Matrix : \n" ] @@ -401,13 +458,13 @@ " \n", " \n", " 0\n", - " 23\n", - " 13\n", + " 20\n", + " 16\n", " \n", " \n", " 1\n", - " 6\n", - " 28\n", + " 5\n", + " 29\n", " \n", " \n", "\n", @@ -416,8 +473,8 @@ "text/plain": [ "Predicted 0 1\n", "Actual \n", - "0 23 13\n", - "1 6 28" + "0 20 16\n", + "1 5 29" ] }, "metadata": {}, @@ -438,32 +495,16 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 67, "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fa405de456344aa3ac6532dd70750283", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "RadioButtons(description='False negative means:', layout=Layout(width='600px'), options=('Negative in both gol…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9caec9fef713456f83e5f5d77c42c59b", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Correct!" + ], "text/plain": [ - "Button(description='Submit', style=ButtonStyle())" + "" ] }, "metadata": {}, @@ -477,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 70, "metadata": {}, "outputs": [ { @@ -494,27 +535,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2d39aea50c9342769af995c43a14d289", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "RadioButtons(description='Which is corret:', layout=Layout(width='600px'), options=('list', 'array', 'dictiona…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "909fc3ee048746149285e600f22e6cc3", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Correct!" + ], "text/plain": [ - "Button(description='Submit', style=ButtonStyle())" + "" ] }, "metadata": {}, @@ -528,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -547,27 +572,11 @@ }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5a0d8f1daf724392bf7a687d643a4413", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "RadioButtons(description='Choose best keyword:', layout=Layout(width='600px'), options=('is', 'lobe', 'patchy …" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b9bc4b1cdcc349a7911085ac889ef568", - "version_major": 2, - "version_minor": 0 - }, + "text/html": [ + "Correct" + ], "text/plain": [ - "Button(description='Submit', style=ButtonStyle())" + "" ] }, "metadata": {}, diff --git a/09_NLP_pneumonia_pyConText_targets_and_modifiers.ipynb b/09_NLP_pneumonia_pyConText_targets_and_modifiers.ipynb index d2a5183..0f47564 100644 --- a/09_NLP_pneumonia_pyConText_targets_and_modifiers.ipynb +++ b/09_NLP_pneumonia_pyConText_targets_and_modifiers.ipynb @@ -152,13 +152,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "NodeDataView({ 99914448661365161132906157324590123994 pneumonia ['evidence_of_pneumonia'] : {'category': 'target'}})" + "NodeDataView({ 249612704042745789883698965661836381159 pneumonia ['evidence_of_pneumonia'] : {'category': 'target'}})" ] }, "metadata": {}, @@ -187,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -213,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -247,12 +247,13 @@ "source": [ "## We didn't mark up a target for \"pneumonias\" since we only had the singular variant \"pneumonia\"\n", "## We can augment our targets by modifying a yaml file (.yml). A starter yaml file is included in our course resources:\n", - "KB/pneumonia_targets.yml" + "KB/pneumonia_targets.yml\n", + "#/edit/decart_rule_based_nlp/KB/pneumonia_targets.yml" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -319,7 +320,7 @@ "2 infiltrate EVIDENCE_OF_PNEUMONIA" ] }, - "execution_count": 8, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -337,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -345,13 +346,13 @@ "output_type": "stream", "text": [ "Loading pneumonia;targets from : \n", - "\t/home/jianlins/work/decart_rule_based_nlp/KB/pneumonia_targets.yml\n" + "\t/home/gastonq/decart_rule_based_nlp/KB/pneumonia_targets.yml\n" ] }, { "data": { "text/plain": [ - "[literal<>; category<<['evidence_of_pneumonia']>>; re<<>>; rule<<>>,\n", + "[literal<>; category<<['evidence_of_pneumonia']>>; re<<>>; rule<<>>,\n", " literal<>; category<<['evidence_of_pneumonia']>>; re<<\\bpneumonia[s]?\\b>>; rule<<>>,\n", " literal<>; category<<['evidence_of_pneumonia']>>; re<<>>; rule<<>>]" ] @@ -370,20 +371,49 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/gastonq/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "nltk.download('punkt')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Loading pneumonia;targets from : /home/jianlins/work/decart_rule_based_nlp/KB/pneumonia_targets.yml\n" + "Loading pneumonia;targets from : /home/gastonq/decart_rule_based_nlp/KB/pneumonia_targets.yml\n" ] }, { "data": { "text/plain": [ - "NodeDataView({ 100246180146938886484983829586404969434 pneumonia ['evidence_of_pneumonia'] : {'category': 'target'}, 100245844219529826004192432960055544794 infiltrate ['evidence_of_pneumonia'] : {'category': 'target'}})" + "NodeDataView({ 138546912158370067406741393043076680679 pneumonia ['evidence_of_pneumonia'] : {'category': 'target'}, 138546691111796652609239507055455243239 consolidation ['evidence_of_pneumonia'] : {'category': 'target'}, 138546489872263866377822019453821389799 infiltrate ['evidence_of_pneumonia'] : {'category': 'target'}})" ] }, "metadata": {}, @@ -395,6 +425,7 @@ "targets2 = []\n", "modifiers2 = []\n", "\n", + "#pneumonia_targets_file = 'gastonq/edit/decart_rule_based_nlp/KB/pneumonia_targets.yml'\n", "pneumonia_targets_file = 'KB/pneumonia_targets.yml'\n", "\n", "# so now let's set this up with more variants of \"EVIDENCE_OF_PNEUMONIA\"\n", @@ -418,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -445,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -477,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -520,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -529,7 +560,7 @@ "'INCORRECT. Please try again. See the documentation above for pyConText itemData ordering'" ] }, - "execution_count": 14, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -584,7 +615,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -661,9 +692,9 @@ "text": [ "****************\n", "Performance for Classifier 2 : 3 total Targets\n", - "Precision : 0.5185185185185185\n", + "Precision : 0.6666666666666666\n", "Recall : 0.8235294117647058\n", - "F1: 0.6363636363636364\n", + "F1: 0.7368421052631577\n", "\n", "Confusion Matrix : \n" ] @@ -701,8 +732,8 @@ " \n", " \n", " 0\n", - " 10\n", - " 26\n", + " 22\n", + " 14\n", " \n", " \n", " 1\n", @@ -716,7 +747,7 @@ "text/plain": [ "Predicted False True \n", "Actual \n", - "0 10 26\n", + "0 22 14\n", "1 6 28" ] }, @@ -791,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -828,7 +859,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -838,7 +869,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -856,7 +887,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -889,7 +920,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -922,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -955,7 +986,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -989,7 +1020,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -998,7 +1029,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1020,7 +1051,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -1052,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -1093,22 +1124,22 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'INCORRECT. It is not clear what you passed in. Please see the list of possible answers'" + "'CORRECT. Since this will modifier targets before it, it would properly modify pnuemonia in the sentence : \"Pneumonia was ruled out\"'" ] }, - "execution_count": 28, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "modifier_directionality_quiz('UPDATE_ME')" + "modifier_directionality_quiz('backward')" ] }, { @@ -1120,14 +1151,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Total Modifiers Loaded for pipeline #3 : [8]\n", + "Total Modifiers Loaded for pipeline #3 : [12]\n", "Total Targets Loaded for pipeline #3 : [3]\n" ] } @@ -1153,9 +1184,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "

PORTABLE CHEST: Comparison made to prior film from X:XX a.m. the same day. The ET tube and nasogastric tube remain in good position. Cardiac and mediastinal contours are stable. No acute changes are seen within the lung parenchyma; specifically, there is no evidence of new infiltrate (skin folds do project over the right lung). No consolidation on either side. IMPRESSION: No evidence of pneumonia.

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# prepare some colors for displaying any markup we might see\n", "colors = {\n", @@ -1178,9 +1222,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\t\t\t " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# And use Brat style display:\n", "view_pycontext_output(context3)" @@ -1195,9 +1255,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Marking up all documents...\n", + "DONE Marking up all documents...\n", + "CPU times: user 872 ms, sys: 28.4 ms, total: 900 ms\n", + "Wall time: 882 ms\n" + ] + } + ], "source": [ "%%time\n", "# NOTE : This is a \"magic\" command to Jupyter to time the execution of this entire cell\n", @@ -1215,7 +1286,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1238,9 +1309,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fac3a32dca1b4e7a89de23597c7715be", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(IntSlider(value=0, description='i', max=69), Output()), _dom_classes=('widget-interact',…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "view_pycontext_graph(report_results)" ] diff --git a/10_NLP_DocumentClassification.ipynb b/10_NLP_DocumentClassification.ipynb index 9d7c288..27f7b6c 100644 --- a/10_NLP_DocumentClassification.ipynb +++ b/10_NLP_DocumentClassification.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -73,14 +73,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\t\t\t " ], @@ -164,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -188,14 +188,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\t\t\t " ], @@ -215,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -225,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -270,43 +270,23 @@ " \n", " 1\n", " T1\n", - " Modifier\n", - " 29\n", - " 38\n", - " excluded\n", - " definite_negated_existence\n", - " \n", - " \n", - " 2\n", - " T2\n", " Target\n", " 66\n", " 74\n", " effusion\n", " evidence_of_pneumonia\n", " \n", - " \n", - " 3\n", - " T3\n", - " Modifier\n", - " 40\n", - " 46\n", - " Likely\n", - " probable_existence\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " markup_id vis_category start end txt type\n", - "0 T0 Target 14 22 effusion evidence_of_pneumonia\n", - "1 T1 Modifier 29 38 excluded definite_negated_existence\n", - "2 T2 Target 66 74 effusion evidence_of_pneumonia\n", - "3 T3 Modifier 40 46 Likely probable_existence" + " markup_id vis_category start end txt type\n", + "0 T0 Target 14 22 effusion evidence_of_pneumonia\n", + "1 T1 Target 66 74 effusion evidence_of_pneumonia" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -318,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -353,35 +333,17 @@ " \n", " \n", " \n", - " \n", - " 0\n", - " R0\n", - " definite_negated_existence\n", - " Modifier\n", - " T1\n", - " Target\n", - " T0\n", - " \n", - " \n", - " 1\n", - " R1\n", - " probable_existence\n", - " Modifier\n", - " T3\n", - " Target\n", - " T2\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " relation_id type arg1_cate arg1_id arg2_cate arg2_id\n", - "0 R0 definite_negated_existence Modifier T1 Target T0\n", - "1 R1 probable_existence Modifier T3 Target T2" + "Empty DataFrame\n", + "Columns: [relation_id, type, arg1_cate, arg1_id, arg2_cate, arg2_id]\n", + "Index: []" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -400,16 +362,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['pos_evidence', 'pos_evidence', 'pos_evidence', 'pos_evidence']" + "['pos_evidence', 'pos_evidence']" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -428,16 +390,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'pneumonia_doc_no'" + "'pneumonia_doc_yes'" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -457,9 +419,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'pneumonia_doc_yes'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "doc_conclusion = classifier.classify_doc(report)\n", "doc_conclusion" @@ -474,9 +447,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\t\t\t " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "view_pycontext_output(classifier.get_last_context_doc())" ] diff --git a/KB/pneumonia_modifiers.yml b/KB/pneumonia_modifiers.yml index fc12883..e9c89b7 100644 --- a/KB/pneumonia_modifiers.yml +++ b/KB/pneumonia_modifiers.yml @@ -11,16 +11,22 @@ Regex: '' Type: DEFINITE_NEGATED_EXISTENCE --- Comments: '' +Direction: forward +Lex: no +Regex: '\b(NO)|(no)\b' +Type: DEFINITE_NEGATED_EXISTENCE +--- +Comments: '' Direction: backward Lex: cannot totally be excluded -Regex: '' +Regex: 'cannot (totally )?be excluded' Type: PROBABLE_NEGATED_EXISTENCE --- Comments: '' Direction: backward Lex: could be ruled out Regex: '' -Type: DEFINITE_NEGATED_EXISTENCE +Type: PROBABLE_NEGATED_EXISTENCE --- Comments: '' Direction: terminate @@ -35,13 +41,31 @@ Regex: '' Type: CONJ --- Comments: '' +Direction: terminate +Lex: however +Regex: '' +Type: CONJ +--- +Comments: '' Direction: bidirectional Lex: decrease -Regex: decrease|decreased|decreasing|reduction|reduced|resolved +Regex: decrease|decreased|decreasing|reduction|reduced|history of Type: HISTORICAL --- Comments: '' Direction: bidirectional Lex: risk factor Regex: risk factor(s)? -Type: FUTURE \ No newline at end of file +Type: FUTURE +--- +Comments: '' +Direction: backward +Lex: R/O +Regex: '' +Type: INIDCATION +--- +Comments: '' +Direction: backward +Lex: REASON FOR THIS EXAMINATION +Regex: '' +Type: INIDCATION \ No newline at end of file diff --git a/KB/pneumonia_targets.yml b/KB/pneumonia_targets.yml index c0ad66b..99c7274 100644 --- a/KB/pneumonia_targets.yml +++ b/KB/pneumonia_targets.yml @@ -1,6 +1,6 @@ Comments: '' Direction: '' -Lex: effusion +Lex: consolidation Regex: '' Type: EVIDENCE_OF_PNEUMONIA --- From 2ecbd9e28c5eb2fdee52136b9b9e93117b4e7644 Mon Sep 17 00:00:00 2001 From: gastonq Date: Wed, 17 Jul 2019 15:56:55 +0000 Subject: [PATCH 2/4] commit --- 10_NLP_DocumentClassification.ipynb | 110 +++++++++++++++------- 11_NLP_ErrorAnalysis2.ipynb | 140 ++++++++++++++-------------- 2 files changed, 146 insertions(+), 104 deletions(-) diff --git a/10_NLP_DocumentClassification.ipynb b/10_NLP_DocumentClassification.ipynb index 27f7b6c..65d0b6c 100644 --- a/10_NLP_DocumentClassification.ipynb +++ b/10_NLP_DocumentClassification.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -55,14 +55,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Let's just consider the example at the beginning as a document,\n", "# and run pyConText to get markups\n", "\n", - "report = \"Right pleural effusion can be excluded. Likely small left pleural effusion. \"\n", + "report = \"NO Right pleural effusion can be excluded. NO Likely small left pleural effusion. \"\n", "\n", "targets = itemData([\"effusion\", \"EVIDENCE_OF_PNEUMONIA\", r\"effusion[s]?\", \"\"])\n", "\n", @@ -73,14 +73,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\t\t\t " ], @@ -94,8 +94,7 @@ ], "source": [ "# To confirm what we get from pyConText\n", - "view_pycontext_output(markups)\n", - " \n" + "view_pycontext_output(markups)\n" ] }, { @@ -164,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -188,14 +187,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\t\t\t " ], @@ -215,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -225,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -262,31 +261,51 @@ " 0\n", " T0\n", " Target\n", - " 14\n", - " 22\n", + " 17\n", + " 25\n", " effusion\n", " evidence_of_pneumonia\n", " \n", " \n", " 1\n", " T1\n", + " Modifier\n", + " 0\n", + " 2\n", + " NO\n", + " definite_negated_existence\n", + " \n", + " \n", + " 2\n", + " T2\n", " Target\n", - " 66\n", - " 74\n", + " 72\n", + " 80\n", " effusion\n", " evidence_of_pneumonia\n", " \n", + " \n", + " 3\n", + " T3\n", + " Modifier\n", + " 43\n", + " 45\n", + " NO\n", + " definite_negated_existence\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " markup_id vis_category start end txt type\n", - "0 T0 Target 14 22 effusion evidence_of_pneumonia\n", - "1 T1 Target 66 74 effusion evidence_of_pneumonia" + " markup_id vis_category start end txt type\n", + "0 T0 Target 17 25 effusion evidence_of_pneumonia\n", + "1 T1 Modifier 0 2 NO definite_negated_existence\n", + "2 T2 Target 72 80 effusion evidence_of_pneumonia\n", + "3 T3 Modifier 43 45 NO definite_negated_existence" ] }, - "execution_count": 7, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -298,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 53, "metadata": { "scrolled": true }, @@ -333,17 +352,35 @@ " \n", " \n", " \n", + " \n", + " 0\n", + " R0\n", + " definite_negated_existence\n", + " Modifier\n", + " T1\n", + " Target\n", + " T0\n", + " \n", + " \n", + " 1\n", + " R1\n", + " definite_negated_existence\n", + " Modifier\n", + " T3\n", + " Target\n", + " T2\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [relation_id, type, arg1_cate, arg1_id, arg2_cate, arg2_id]\n", - "Index: []" + " relation_id type arg1_cate arg1_id arg2_cate arg2_id\n", + "0 R0 definite_negated_existence Modifier T1 Target T0\n", + "1 R1 definite_negated_existence Modifier T3 Target T2" ] }, - "execution_count": 8, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -362,16 +399,21 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['pos_evidence', 'pos_evidence']" + "['pos_evidence',\n", + " 'neg_evidence',\n", + " 'pos_evidence',\n", + " 'neg_evidence',\n", + " 'pos_evidence',\n", + " 'pos_evidence']" ] }, - "execution_count": 9, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -390,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 55, "metadata": {}, "outputs": [ { @@ -399,7 +441,7 @@ "'pneumonia_doc_yes'" ] }, - "execution_count": 10, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -419,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -428,7 +470,7 @@ "'pneumonia_doc_yes'" ] }, - "execution_count": 11, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -447,14 +489,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "\t\t\t " ], diff --git a/11_NLP_ErrorAnalysis2.ipynb b/11_NLP_ErrorAnalysis2.ipynb index 514645f..64a4357 100644 --- a/11_NLP_ErrorAnalysis2.ipynb +++ b/11_NLP_ErrorAnalysis2.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 281, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 357, "metadata": {}, "outputs": [], "source": [ @@ -99,21 +99,32 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 358, + "metadata": {}, + "outputs": [], + "source": [ + "#?DocumentClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 395, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Reading annotations from file : data/training_v2.zip\n", - "Opening local file : data/training_v2.zip\n" + "Reading annotations from file : data/test_v2.zip\n", + "Opening local file : data/test_v2.zip\n" ] } ], "source": [ "#Read in the training documents and annotations\n", - "annotated_doc_map = read_doc_annotations('data/training_v2.zip')\n", + "annotated_doc_map = read_doc_annotations('data/test_v2.zip')\n", + "#annotated_doc_map = read_doc_annotations('data/training_v2.zip')\n", + "\n", "\n", "#Here we initiate our DocumentClassifier directly through rule files:\n", "#Change the file names if you use different files \n", @@ -130,22 +141,22 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 396, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Precision : 0.5185185185185185\n", - "Recall : 0.8235294117647058\n", - "F1: 0.6363636363636364\n", + "Precision : 0.8125\n", + "Recall : 0.9285714285714286\n", + "F1: 0.8666666666666666\n", "\n", "Confusion Matrix : \n", "Predicted 0 1\n", "Actual \n", - "0 10 26\n", - "1 6 28\n" + "0 13 3\n", + "1 1 13\n" ] } ], @@ -163,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 397, "metadata": {}, "outputs": [ { @@ -171,9 +182,9 @@ "output_type": "stream", "text": [ "Start to evaluate against reference standards...\n", - "Precision : 0.519\n", - "Recall : 0.824\n", - "F1: 0.636\n" + "Precision : 0.812\n", + "Recall : 0.929\n", + "F1: 0.867\n" ] }, { @@ -209,13 +220,13 @@ " \n", " \n", " 1\n", - " 28\n", - " 6\n", + " 13\n", + " 1\n", " \n", " \n", " 0\n", - " 26\n", - " 10\n", + " 3\n", + " 13\n", " \n", " \n", "\n", @@ -224,8 +235,8 @@ "text/plain": [ "Predicted 1 0\n", "Actual \n", - "1 28 6\n", - "0 26 10" + "1 13 1\n", + "0 3 13" ] }, "metadata": {}, @@ -256,47 +267,22 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 398, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
document nameSnippets
subject_id_150_hadm_id_12121
es are\n", - " unremarkable.\n", + "
document nameSnippets
subject_id_4276_hadm_id_25705
the carina.\n", " \n", - " IMPRESSION: Small focal opacity in right upper lobe and right paratracheal\n", - " opacity. In the sett
CHEST PA AND LATERAL: The heart size is normal. There is an area of\n", - " increased opacity lateral to the right paratracheal stripe. In the
pacity lateral to the right paratracheal stripe. In the right\n", - " upper lobe, there is a small focal opacity. The lungs are otherwise clear.\n", - " There are no
subject_id_5472_hadm_id_11987
id SVC. \n", - " There is no apparent pneumothorax. A right IJ line, NGT, and ETT are\n", - " unchanged as are the parenchymal changes in the lungs compared to the earlier\n", - " chest x-ray this mor
subject_id_7027_hadm_id_33117
ossibility of free\n", - " intraperitoneal air.\n", - " 2) Left lower lobe atelectasis/consolidation.\n", - " 3) Moderate gastric distention with multiple
stinal and hilar contours are\n", - " unremarkable. There is patchy opacity at the left lower lobe representing\n", - " either atelectasis or consolidation. No definite free air is identified,\n", - " howeve
subject_id_7272_hadm_id_19098
rt failure with bilateral pleural effusions.\n", - " Collapse and/or consolidation at the bases bilaterally.\n", - "\n", - "
rall heart size is difficult to assess. There is dense retrocardiac\n", - " opacity, possibly secondary to collapse and/or consolidation in the left lower\n", - " lobe. There is also a rig
ion in the left lower\n", - " lobe. There is also a right lower lobe and middle lobe opacity consistent\n", - " with collapse and/or consolidation.\n", + " IMPRESSION:\n", " \n", - " IMPRESSION: Persistent left heart fai
subject_id_7525_hadm_id_19141
n distal superior vena cava, unchanged. There is marked\n", - " improvement of the bilateral consolidations, especially on the right. The NG\n", - " tube tip is
subject_id_9082_hadm_id_29395
tient with seizure.\n", + " 1. New bibasilar opacities, which may represent atelectasis or aspiration\n", + " pneumonia.\n", " \n", - " Low lung volumes. Bilateral basilar opacities, considerably larger at the\n", - " left base than at
hyroid.\n", - " \n", - " IMPRESSION: Lung volumes with bilateral basilar opacities.\n", - " \n", - " Question substernal thyroid enlargemen
" + " 2. Right central venous catheter with
FINDINGS: Since prior examination, has been interval development of bibasilar\n", + " opacities, may represent atelectasis or aspiration pneumonia. Right- sided\n", + " subclavian approach central ve
" ], "text/plain": [ "" @@ -323,18 +309,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 399, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "22a4ec421ab7440f8f8d0983ff9ccbfd", + "model_id": "7320dc0ba69040138d47d9f27bb21ae6", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "interactive(children=(IntSlider(value=0, description='i', max=25), Output()), _dom_classes=('widget-interact',…" + "interactive(children=(IntSlider(value=0, description='i', max=2), Output()), _dom_classes=('widget-interact',)…" ] }, "metadata": {}, @@ -357,15 +343,22 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 389, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "No documents to view.\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "78987c4514e4405f91796de0c87daed5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(IntSlider(value=0, description='i', max=0), Output()), _dom_classes=('widget-interact',)…" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -420,7 +413,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "977fd3fee8544ce0b25582a39cb041f3", + "model_id": "69e84d9095354af2807157c4a488abb4", "version_major": 2, "version_minor": 0 }, @@ -434,7 +427,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "712dd4de423e42f2bbb27df919dd1977", + "model_id": "ac48e5e98f0d41fea5a32afb213ca927", "version_major": 2, "version_minor": 0 }, @@ -478,7 +471,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a37f9a4ff0284ba6a1e52d41db33a43a", + "model_id": "aa29111d8ddc40c7aadfc308eac36671", "version_major": 2, "version_minor": 0 }, @@ -492,7 +485,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ae0d2e99cfac454d91b221e235c83507", + "model_id": "849a2d8fa6b74b42b2021e0f75a38be4", "version_major": 2, "version_minor": 0 }, @@ -536,7 +529,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2391d3c571c348c8a4f10dd59a823260", + "model_id": "ef7def291aff42189a8e9b393e701bf5", "version_major": 2, "version_minor": 0 }, @@ -550,7 +543,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d04b7760c6964a139de089d3d8f98cfb", + "model_id": "ed474429133f4d3f8df7b8666af5b3cb", "version_major": 2, "version_minor": 0 }, @@ -575,6 +568,13 @@ "Presenters : Dr. Wendy Chapman, Kelly Peterson, Alec Chapman, Jianlin Shi
Acknowledgement: Many thanks to Olga Patterson because part of the materials are adopted from his previous work." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, From 206982fdddd97389674210ab202faa5e7f5e52c5 Mon Sep 17 00:00:00 2001 From: gastonq Date: Wed, 17 Jul 2019 15:58:32 +0000 Subject: [PATCH 3/4] commit 1 --- KB/doc_inferences.csv | 4 ++-- KB/featurer_inferences.csv | 12 ++++++------ KB/pneumonia_modifiers.tsv | 1 + KB/pneumonia_modifiers.yml | 32 +++++++++++++++++++++++++------- KB/pneumonia_targets.yml | 20 +++++++++++++++++++- 5 files changed, 53 insertions(+), 16 deletions(-) diff --git a/KB/doc_inferences.csv b/KB/doc_inferences.csv index e0d9fba..33c0200 100644 --- a/KB/doc_inferences.csv +++ b/KB/doc_inferences.csv @@ -2,6 +2,6 @@ DocConclusion,EvidenceTypes # The rule in document inferences are processed from top to bottom. # If any one of the rules is matched, the rest rules below it will be skipped. # if the document has a EVIDENCE_OF_PNEUMONIA annotation, conclude PNEUMONIA_DOC_YES. -PNEUMONIA_DOC_YES,POS_EVIDENCE +PNEUMONIA_DOC_YES,EVIDENCE_OF_PNEUMONIA # if no above rule matched, conclude NEG_COLON_CA_DOC (default conclusion) -PNEUMONIA_DOC_NO \ No newline at end of file +PNEUMONIA_DOC_NO,NEG_EVIDENCE \ No newline at end of file diff --git a/KB/featurer_inferences.csv b/KB/featurer_inferences.csv index 8aa44a4..30dbbdb 100644 --- a/KB/featurer_inferences.csv +++ b/KB/featurer_inferences.csv @@ -1,9 +1,9 @@ ConclusionType,SourceType,ModifierValues -#if an annotation is 'EVIDENCE_OF_PNEUMONIA', only has a modifier "DEFINITE_NEGATED_EXISTENCE', change this annotation type to NEG_EVIDENCE -#NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,PROBABLE_NEGATED_EXISTENCE -#NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,DEFINITE_NEGATED_EXISTENCE -POS_EVIDENCE,EVIDENCE_OF_PNEUMONIA, - - +#if an annotation is 'EVIDENCE_OF_PNEUMONIA', only has a modifier "DEFINITE_NEGATED_EXISTENCE', change this annotation type to #NEG_EVIDENCE +#POS_EVIDENCE,EVIDENCE_OF_PNEUMONIA, +NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,DEFINITE_NEGATED_EXISTENCE +NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,PROBABLE_NEGATED_EXISTENCE +NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,INDICATION +NEG_EVIDENCE,EVIDENCE_OF_PNEUMONIA,HISTORICAL \ No newline at end of file diff --git a/KB/pneumonia_modifiers.tsv b/KB/pneumonia_modifiers.tsv index 91d1779..b37f823 100644 --- a/KB/pneumonia_modifiers.tsv +++ b/KB/pneumonia_modifiers.tsv @@ -2,6 +2,7 @@ Lex Type Regex Direction Unnamed: 4 Unnamed: 5 Unnamed: 6 Codes can be ruled out DEFINITE_NEGATED_EXISTENCE backward cannot totally be excluded PROBABLE_NEGATED_EXISTENCE backward could be ruled out DEFINITE_NEGATED_EXISTENCE backward +No DEFINITE_NEGATED_EXISTENCE forward although CONJ terminate but CONJ terminate decrease HISTORICAL decrease|decreased|decreasing|reduction|reduced|resolved bidirectional diff --git a/KB/pneumonia_modifiers.yml b/KB/pneumonia_modifiers.yml index e9c89b7..3859a14 100644 --- a/KB/pneumonia_modifiers.yml +++ b/KB/pneumonia_modifiers.yml @@ -12,8 +12,8 @@ Type: DEFINITE_NEGATED_EXISTENCE --- Comments: '' Direction: forward -Lex: no -Regex: '\b(NO)|(no)\b' +Lex: Negative +Regex: '\bNO\b|\bno\b|\bNo\b' Type: DEFINITE_NEGATED_EXISTENCE --- Comments: '' @@ -49,7 +49,7 @@ Type: CONJ Comments: '' Direction: bidirectional Lex: decrease -Regex: decrease|decreased|decreasing|reduction|reduced|history of +Regex: decrease|decreased|decreasing|reduction|reduced|history of|improve|improvement Type: HISTORICAL --- Comments: '' @@ -59,13 +59,31 @@ Regex: risk factor(s)? Type: FUTURE --- Comments: '' -Direction: backward +Direction: forward Lex: R/O -Regex: '' -Type: INIDCATION +Regex: '\b[R]\/[O]\b' +Type: INDICATION --- Comments: '' Direction: backward Lex: REASON FOR THIS EXAMINATION Regex: '' -Type: INIDCATION \ No newline at end of file +Type: INDICATION +--- +Comments: '' +Direction: terminate +Lex: 'MEDICAL' +Regex: '\b(FINAL|REPORT|IMPRESSION|UNDERLYING MEDICAL CONDITION)\b' +Type: CONJ +--- +Comments: '' +Direction: forward +Lex: Negative +Regex: '(check|[Ee]valuate|reason)( for)?' +Type: DEFINITE_NEGATED_EXISTENCE +--- +Comments: '' +Direction: forward +Lex: atelectasis +Regex: '' +Type: DEFINITE_NEGATED_EXISTENCE \ No newline at end of file diff --git a/KB/pneumonia_targets.yml b/KB/pneumonia_targets.yml index 99c7274..5ce0aaa 100644 --- a/KB/pneumonia_targets.yml +++ b/KB/pneumonia_targets.yml @@ -1,5 +1,11 @@ Comments: '' Direction: '' +Lex: bronchogram +Regex: '\bbronchogram' +Type: EVIDENCE_OF_PNEUMONIA +--- +Comments: '' +Direction: '' Lex: consolidation Regex: '' Type: EVIDENCE_OF_PNEUMONIA @@ -7,7 +13,7 @@ Type: EVIDENCE_OF_PNEUMONIA Comments: '' Direction: '' Lex: pneumonia -Regex: \bpneumonia[s]?\b +Regex: '\bpneumonia' Type: EVIDENCE_OF_PNEUMONIA --- Comments: '' @@ -15,3 +21,15 @@ Direction: '' Lex: infiltrate Regex: '' Type: EVIDENCE_OF_PNEUMONIA +--- +Comments: '' +Direction: '' +Lex: opacity +Regex: '' +Type: EVIDENCE_OF_PNEUMONIA +--- +Comments: '' +Direction: '' +Lex: opacification +Regex: '' +Type: EVIDENCE_OF_PNEUMONIA From 80e17186c6e0d94e6ca60506ae113cae5b580fc9 Mon Sep 17 00:00:00 2001 From: gastonq Date: Thu, 18 Jul 2019 15:10:05 +0000 Subject: [PATCH 4/4] abc --- 08_NLP_Regex_for_Concept_Extraction.ipynb | 54 ++- 24_brat_workspace_setup.ipynb | 4 +- ...ulate_Agreement_for_Brat_Annotations.ipynb | 373 ++++++++++++------ 3 files changed, 301 insertions(+), 130 deletions(-) diff --git a/08_NLP_Regex_for_Concept_Extraction.ipynb b/08_NLP_Regex_for_Concept_Extraction.ipynb index 63572fd..769e090 100644 --- a/08_NLP_Regex_for_Concept_Extraction.ipynb +++ b/08_NLP_Regex_for_Concept_Extraction.ipynb @@ -993,9 +993,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "txt=\"cardiovascular: patient has cardiovascular\"\n", "re.search('cardiovascular$', txt)" @@ -1012,10 +1023,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "match: cardiovascular\n", + "span: 0 14\n" + ] + } + ], "source": [ + "import re\n", "matched=re.match('^cardiovascular', txt)\n", "print(matched)\n", "print(\"match:\", matched.group())\n", @@ -1058,9 +1080,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "\n", + "retrieved in as a tuple/ full span\n", + "all parts: ('6', '15', '2015') = admission date:6/15-2015 \n", + "\n", + "retrieved as a dictionary\n", + "{'month': '6', 'day': '15', 'year': '2015'} \n", + "\n", + "retrieved in parts\n", + "month: 6 = 6\n", + "day: 15 = 15\n", + "year: 2015 = 2015\n", + "{'month': '6', 'day': '15', 'year': '2015'}\n" + ] + } + ], "source": [ "txt=\"admission date:6/15-2015.\"\n", "fullSpan=re.match(r\"admission date:\\s*(?P\\d{1,2})[-|\\/](?P\\d{1,2})-(?P\\d{2,4})\", txt)\n", diff --git a/24_brat_workspace_setup.ipynb b/24_brat_workspace_setup.ipynb index 0276ad1..1c0c1d3 100644 --- a/24_brat_workspace_setup.ipynb +++ b/24_brat_workspace_setup.ipynb @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ diff --git a/27_Calculate_Agreement_for_Brat_Annotations.ipynb b/27_Calculate_Agreement_for_Brat_Annotations.ipynb index 99de365..ba0d538 100644 --- a/27_Calculate_Agreement_for_Brat_Annotations.ipynb +++ b/27_Calculate_Agreement_for_Brat_Annotations.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -25,7 +25,7 @@ "Requirement already satisfied: sortedcontainers<3.0,>=2.0 in /opt/conda/lib/python3.7/site-packages (from intervaltree) (2.1.0)\n", "Building wheels for collected packages: intervaltree\n", " Building wheel for intervaltree (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25h Stored in directory: /home/jianlins/.cache/pip/wheels/08/99/c0/5a5942f5b9567c59c14aac76f95a70bf11dccc71240b91ebf5\n", + "\u001b[?25h Stored in directory: /home/gastonq/.cache/pip/wheels/08/99/c0/5a5942f5b9567c59c14aac76f95a70bf11dccc71240b91ebf5\n", "Successfully built intervaltree\n", "Installing collected packages: intervaltree\n", "Successfully installed intervaltree-3.0.2\n" @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -123,74 +123,17 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "EVIDENCE_OF_PNEUMONIA\n", - "0 0 14 None\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
B+B-
A+00.0
A-14NaN
\n", - "
" - ], - "text/plain": [ - " B+ B-\n", - "A+ 0 0.0\n", - "A- 14 NaN" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ "PNEUMONIA_DOC_NO\n", - "0 0 4 None\n" + "4 2 0 None\n" ] }, { @@ -221,12 +164,12 @@ " \n", " \n", " A+\n", - " 0\n", - " 0.0\n", + " 4\n", + " 2.0\n", " \n", " \n", " A-\n", - " 4\n", + " 0\n", " NaN\n", " \n", " \n", @@ -235,8 +178,8 @@ ], "text/plain": [ " B+ B-\n", - "A+ 0 0.0\n", - "A- 4 NaN" + "A+ 4 2.0\n", + "A- 0 NaN" ] }, "metadata": {}, @@ -247,7 +190,7 @@ "output_type": "stream", "text": [ "PNEUMONIA_DOC_YES\n", - "0 0 6 None\n" + "4 0 7 None\n" ] }, { @@ -278,12 +221,12 @@ " \n", " \n", " A+\n", - " 0\n", + " 4\n", " 0.0\n", " \n", " \n", " A-\n", - " 6\n", + " 7\n", " NaN\n", " \n", " \n", @@ -292,8 +235,8 @@ ], "text/plain": [ " B+ B-\n", - "A+ 0 0.0\n", - "A- 6 NaN" + "A+ 4 0.0\n", + "A- 7 NaN" ] }, "metadata": {}, @@ -336,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -346,15 +289,72 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CONSOLIDATION\n", + "1 3 1 None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
B+B-
A+13.0
A-1NaN
\n", + "
" + ], + "text/plain": [ + " B+ B-\n", + "A+ 1 3.0\n", + "A- 1 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", "text": [ "EVIDENCE_OF_PNEUMONIA\n", - "0 0 14 None\n" + "0 0 13 None\n" ] }, { @@ -390,7 +390,7 @@ " \n", " \n", " A-\n", - " 14\n", + " 13\n", " NaN\n", " \n", " \n", @@ -400,7 +400,7 @@ "text/plain": [ " B+ B-\n", "A+ 0 0.0\n", - "A- 14 NaN" + "A- 13 NaN" ] }, "metadata": {}, @@ -410,8 +410,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "PNEUMONIA_DOC_NO\n", - "0 0 4 None\n" + "LOCAL_INFILTRATE\n", + "0 1 1 None\n" ] }, { @@ -443,11 +443,125 @@ " \n", " A+\n", " 0\n", - " 0.0\n", + " 1.0\n", + " \n", + " \n", + " A-\n", + " 1\n", + " NaN\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " B+ B-\n", + "A+ 0 1.0\n", + "A- 1 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PNEUMONIA\n", + "0 9 0 None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + "
B+B-
A+09.0
A-0NaN
\n", + "
" + ], + "text/plain": [ + " B+ B-\n", + "A+ 0 9.0\n", + "A- 0 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PNEUMONIA_DOC_NO\n", + "4 2 0 None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -456,8 +570,8 @@ ], "text/plain": [ " B+ B-\n", - "A+ 0 0.0\n", - "A- 4 NaN" + "A+ 4 2.0\n", + "A- 0 NaN" ] }, "metadata": {}, @@ -468,7 +582,7 @@ "output_type": "stream", "text": [ "PNEUMONIA_DOC_YES\n", - "0 0 6 None\n" + "4 0 7 None\n" ] }, { @@ -499,12 +613,12 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -513,8 +627,8 @@ ], "text/plain": [ " B+ B-\n", - "A+ 0 0.0\n", - "A- 6 NaN" + "A+ 4 0.0\n", + "A- 7 NaN" ] }, "metadata": {}, @@ -547,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -556,11 +670,26 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.36363636363636365\n" + ] + } + ], "source": [ - "# your code goes here:\n" + "# your code goes here: observed agreement\n", + "d = 11-(a+b+c)\n", + "#a b\n", + "#c d\n", + "\n", + "obs = (a+d)/(a+b+d+c)\n", + "print(obs)\n", + "\n" ] }, { @@ -581,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -589,10 +718,29 @@ "output_type": "stream", "text": [ "PNEUMONIA_DOC_NO\n", - "(0, 0, 4, None)\n", - "\tNo documents to display.\n", + "(4, 2, 0, None)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6a12da18b6ff4464ac4434d0bf5c2ffa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='
B+B-
A+42.0
A-0NaN
A+040.0
A-67NaN