Added dataset description and reorganised them

opt-out-tools · Aug 25, 2019 · 4391c3f · 4391c3f
1 parent 6fb50d3
commit 4391c3f
Show file tree

Hide file tree

Showing 10 changed files with 491 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@ docs/
 .Rproj.user
 data/interim/sample_metoo_tweets.csv
 data/raw/metoo/
+data/external/rapeglish/
diff --git a/README.md b/README.md
@@ -75,3 +75,11 @@ python -m pytest tests/test_nn_dataturks.py
 ```
 
 NB. this is not a permanent solution but will enable initial effective collaboration. If you have any thoughts or ideas on how to improve this, just email [email protected]
+
+Project Datasets
+--------------------
+
+- aws_annotated - our own annotations plus the hatespeech dataset
+- dataturks - labels obtained through Dataturks crowdsourced labeling
+- hatespeech - obtained from Zeerak Waseem
+- rapeglish - scraped from Emma Jane's Random Rape Threat Generator
diff --git a/data/external/zeerack/amateur_expert.json → data/raw/hatespeech/amateur_expert.json b/data/external/zeerack/amateur_expert.json → data/raw/hatespeech/amateur_expert.json
diff --git a/...al/zeerack/amateur_expert_annotations.csv → ...hatespeech/amateur_expert_annotations.csv b/...al/zeerack/amateur_expert_annotations.csv → ...hatespeech/amateur_expert_annotations.csv
diff --git a/data/external/zeerack/neither.json → data/raw/hatespeech/neither.json b/data/external/zeerack/neither.json → data/raw/hatespeech/neither.json
diff --git a/data/external/zeerack/racism.json → data/raw/hatespeech/racism.json b/data/external/zeerack/racism.json → data/raw/hatespeech/racism.json
diff --git a/data/external/zeerack/sexism.json → data/raw/hatespeech/sexism.json b/data/external/zeerack/sexism.json → data/raw/hatespeech/sexism.json
diff --git a/notebooks/hatespeech/.ipynb_checkpoints/eda_hatespeech-checkpoint.ipynb b/notebooks/hatespeech/.ipynb_checkpoints/eda_hatespeech-checkpoint.ipynb
diff --git a/notebooks/hatespeech/.ipynb_checkpoints/rulesbased_hatespeech-checkpoint.ipynb b/notebooks/hatespeech/.ipynb_checkpoints/rulesbased_hatespeech-checkpoint.ipynb
@@ -0,0 +1,82 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from src.evaluation.hatespeech.evaluation_rulesbased_hatespeech import analysis_of_weak_labeling\n",
+    "from src.features.hatespeech.featureeng_rulesbased_hatespeech import contains_dick_or_synonym\n",
+    "from src.utils.normalize import normalize\n",
+    "\n",
+    "# NOTE(review): this commit moves the Waseem files to data/raw/hatespeech;\n",
+    "# confirm that hs_data.csv really still lives under data/external.\n",
+    "data = pd.read_csv(\"../../data/external/hatespeech/hs_data.csv\")\n",
+    "data['normalized'] = data['text'].apply(normalize)\n",
+    "\n",
+    "# Normalized text, one entry per row of the dataset\n",
+    "X = data['normalized']\n",
+    "print(X.shape)\n",
+    "\n",
+    "# True labels: indicator column for the 'misogynistic' annotation\n",
+    "Y = pd.get_dummies(data['annotation'])['misogynistic']\n",
+    "print(Y.shape)\n",
+    "\n",
+    "labeling_functions = [\n",
+    "    contains_dick_or_synonym,\n",
+    "]\n",
+    "# analysis_of_weak_labeling reads labeling_function_names.values(), so the names\n",
+    "# must be a mapping; passing a plain list raised AttributeError.\n",
+    "labeling_function_names = {\n",
+    "    contains_dick_or_synonym.__name__: \"genitalia_reference\",\n",
+    "}\n",
+    "\n",
+    "analysis_of_weak_labeling(data, Y, labeling_functions, labeling_function_names)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PyCharm (find-out)",
+   "language": "python",
+   "name": "pycharm-66275036"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "metadata": {
+     "collapsed": false
+    },
+    "source": []
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/runs/Aug20_16-50-55_tbag/events.out.tfevents.1566312657.tbag.8152.0 b/runs/Aug20_16-50-55_tbag/events.out.tfevents.1566312657.tbag.8152.0