Skip to content
This repository has been archived by the owner on Sep 3, 2023. It is now read-only.

Commit

Permalink
Added dataset description and reorganised them
Browse files Browse the repository at this point in the history
  • Loading branch information
malteserteresa committed Aug 25, 2019
1 parent 6fb50d3 commit 4391c3f
Show file tree
Hide file tree
Showing 10 changed files with 491 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ docs/
.Rproj.user
data/interim/sample_metoo_tweets.csv
data/raw/metoo/
data/external/rapeglish/
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,11 @@ python -m pytest tests/test_nn_dataturks.py
```

NB: This is not a permanent solution, but it will enable effective initial collaboration. If you have any thoughts or ideas on how to improve this, just email [email protected]

Project Datasets
--------------------

- aws_annotated - our annotations plus the hatespeech dataset
- dataturks - obtained from Dataturks crowdsourced labeling
- hatespeech - obtained from Zeerak Waseem
- rapeglish - scraped from the random rape threat generator by Emma Jane
File renamed without changes.
File renamed without changes.
File renamed without changes.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"\n",
"from src.evaluation.hatespeech.evaluation_rulesbased_hatespeech import analysis_of_weak_labeling\n",
"from src.features.hatespeech.featureeng_rulesbased_hatespeech import contains_dick_or_synonym, contains_slut_or_synonyms\n",
"from src.utils.normalize import normalize\n",
"\n",
"# NOTE(review): the writer instance is discarded; kept only to preserve the cell's existing side effects - confirm it is still needed.\n",
"SummaryWriter()\n",
"\n",
"# Load the hatespeech dataset and add a normalized copy of every text.\n",
"data = pd.read_csv(\"../../data/external/hatespeech/hs_data.csv\")\n",
"data['normalized'] = data['text'].apply(normalize)\n",
"\n",
"# Generate vectors\n",
"X = data['normalized']\n",
"print(X.shape)\n",
"\n",
"# True labels: indicator column for the 'misogynistic' annotation\n",
"Y = pd.get_dummies(data['annotation'])['misogynistic']\n",
"print(Y.shape)\n",
"# Create noisy labels\n",
"# data['contains_dick_or_synonym'] = data['text'].apply(lambda tweet: contains_dick_or_synonym(tweet))\n",
"# data['contains_slut_or_synonym'] = data['text'].apply(lambda tweet: contains_slut_or_synonyms(tweet))\n",
"# L = data[['contains_dick_or_synonym', 'contains_slut_or_synonym']]\n",
"\n",
"labeling_functions = [\n",
"    contains_dick_or_synonym,\n",
"]\n",
"# analysis_of_weak_labeling calls .values() on this argument, so it has to be a\n",
"# mapping (labeling function -> display name); passing a plain list raised\n",
"# AttributeError: 'list' object has no attribute 'values'.\n",
"labeling_function_names = {\n",
"    \"contains_dick_or_synonym\": \"genitalia_reference\",\n",
"}\n",
"\n",
"analysis_of_weak_labeling(data, Y, labeling_functions, labeling_function_names)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "PyCharm (find-out)",
"language": "python",
"name": "pycharm-66275036"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Binary file not shown.

0 comments on commit 4391c3f

Please sign in to comment.