diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..4b33fe18 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,52 @@ +name: Deploy docs +on: + workflow_dispatch: + push: + branches: + - 'master' + pull_request: +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + if: (github.event_name != 'pull_request') + + - name: Set up Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' + cache-dependency-path: | + setup.py + requirements-docs.txt + + - name: Save time for cache for mkdocs + run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + + - name: Caching + uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + + - name: Install Dependencies + run: pip install -r requirements-docs.txt + + - name: Deploy to GitHub Pages + run: mkdocs gh-deploy --force + if: (github.event_name != 'pull_request') + + - name: Build docs to check for errors + run: mkdocs build + if: (github.event_name == 'pull_request') diff --git a/g3doc/__init__.py b/docs/__init__.py similarity index 100% rename from g3doc/__init__.py rename to docs/__init__.py diff --git a/g3doc/guide/_index.yaml b/docs/guide/_index.yaml similarity index 100% rename from g3doc/guide/_index.yaml rename to docs/guide/_index.yaml diff --git a/g3doc/guide/_toc.yaml b/docs/guide/_toc.yaml similarity index 100% rename from g3doc/guide/_toc.yaml rename to docs/guide/_toc.yaml diff --git a/g3doc/guide/guidance.md b/docs/guide/guidance.md similarity index 98% rename from g3doc/guide/guidance.md rename to docs/guide/guidance.md index 4ce06879..fe5d85c5 100644 --- a/g3doc/guide/guidance.md +++ 
b/docs/guide/guidance.md @@ -21,7 +21,7 @@ sociolinguists, and cultural anthropologists, as well as with members of the populations on which technology will be deployed. A single model, for example, the toxicity model that we leverage in the -[example colab](https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_Example_Colab), +[example colab](../../tutorials/Fairness_Indicators_Example_Colab), can be used in many different contexts. A toxicity model deployed on a website to filter offensive comments, for example, is a very different use case than the model being deployed in an example web UI where users can type in a sentence and @@ -315,7 +315,7 @@ Prediction Flip Count** * **Don’t see the metrics you’re looking for?** Follow the documentation -[here](https://github.com/tensorflow/model-analysis/blob/master/g3doc/post_export_metrics.md) +[here](https://tensorflow.github.io/model-analysis/post_export_metrics/) to add you own custom metric. ## Final notes diff --git a/g3doc/images/facedetection.png b/docs/images/facedetection.png similarity index 100% rename from g3doc/images/facedetection.png rename to docs/images/facedetection.png diff --git a/g3doc/images/fairnessIndicators.png b/docs/images/fairnessIndicators.png similarity index 100% rename from g3doc/images/fairnessIndicators.png rename to docs/images/fairnessIndicators.png diff --git a/g3doc/images/googleai.png b/docs/images/googleai.png similarity index 100% rename from g3doc/images/googleai.png rename to docs/images/googleai.png diff --git a/g3doc/images/mlpracticum.png b/docs/images/mlpracticum.png similarity index 100% rename from g3doc/images/mlpracticum.png rename to docs/images/mlpracticum.png diff --git a/g3doc/images/tensorboard.png b/docs/images/tensorboard.png similarity index 100% rename from g3doc/images/tensorboard.png rename to docs/images/tensorboard.png diff --git a/docs/images/tf_full_color_primary_icon.svg b/docs/images/tf_full_color_primary_icon.svg 
new file mode 100644 index 00000000..3e724777 --- /dev/null +++ b/docs/images/tf_full_color_primary_icon.svg @@ -0,0 +1 @@ +FullColorPrimary Icon \ No newline at end of file diff --git a/g3doc/images/tfhub.png b/docs/images/tfhub.png similarity index 100% rename from g3doc/images/tfhub.png rename to docs/images/tfhub.png diff --git a/g3doc/images/toxicity_detection.png b/docs/images/toxicity_detection.png similarity index 100% rename from g3doc/images/toxicity_detection.png rename to docs/images/toxicity_detection.png diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..78e960f0 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,94 @@ +# Fairness Indicators + +/// html | div[style='float: left; width: 50%;'] +Fairness Indicators is a library that enables easy computation of commonly-identified fairness metrics for binary and multiclass classifiers. With the Fairness Indicators tool suite, you can: + +- Compute commonly-identified fairness metrics for classification models +- Compare model performance across subgroups to a baseline, or to other models +- Use confidence intervals to surface statistically significant disparities +- Perform evaluation over multiple thresholds + +Use Fairness Indicators via the: + +- [Evaluator component](https://tensorflow.github.io/tfx/guide/evaluator/) in a [TFX pipeline](https://tensorflow.github.io/tfx/) +- [TensorBoard plugin](https://github.com/tensorflow/tensorboard/blob/master/docs/fairness-indicators.md) +- [TensorFlow Model Analysis library](https://tensorflow.github.io/tfx/guide/fairness_indicators/) +- [Model Agnostic TFMA library](https://tensorflow.github.io/tfx/guide/fairness_indicators/#using-fairness-indicators-with-non-tensorflow-models) + +/// + +/// html | div[style='float: right;width: 50%;'] +```python +eval_config_pbtxt = """ + +model_specs { + label_key: "%s" +} + +metrics_specs { + metrics { + class_name: "FairnessIndicators" + config: '{ "thresholds": [0.25, 0.5, 0.75] }' + } + metrics { + 
class_name: "ExampleCount" + } +} + +slicing_specs {} +slicing_specs { + feature_keys: "%s" +} + +options { + compute_confidence_intervals { value: False } + disabled_outputs{values: "analysis"} +} +""" % (LABEL_KEY, GROUP_KEY) +``` +/// + +/// html | div[style='clear: both;'] +/// + +
+ +- ![ML Practicum: Fairness in Perspective API using Fairness Indicators](https://www.tensorflow.org/static/responsible_ai/fairness_indicators/images/mlpracticum_480.png) + + ### [ML Practicum: Fairness in Perspective API using Fairness Indicators](https://developers.google.com/machine-learning/practica/fairness-indicators?utm_source=github&utm_medium=github&utm_campaign=fi-practicum&utm_term=&utm_content=repo-body) + + --- + + [Try the Case Study](https://developers.google.com/machine-learning/practica/fairness-indicators?utm_source=github&utm_medium=github&utm_campaign=fi-practicum&utm_term=&utm_content=repo-body) + +- ![Fairness Indicators on the TensorFlow blog](images/tf_full_color_primary_icon.svg) + + ### [Fairness Indicators on the TensorFlow blog](https://blog.tensorflow.org/2019/12/fairness-indicators-fair-ML-systems.html) + + --- + + [Read on the TensorFlow blog](https://blog.tensorflow.org/2019/12/fairness-indicators-fair-ML-systems.html) + +- ![Fairness Indicators on GitHub](https://www.tensorflow.org/static/resources/images/github-card-16x9_480.png) + + ### [Fairness Indicators on GitHub](https://github.com/tensorflow/fairness-indicators) + --- + + [View on GitHub](https://github.com/tensorflow/fairness-indicators) + +- ![Fairness Indicators on the Google AI Blog](https://www.tensorflow.org/static/responsible_ai/fairness_indicators/images/googleai_720.png) + + ### [Fairness Indicators on the Google AI Blog](https://ai.googleblog.com/2019/12/fairness-indicators-scalable.html) + --- + + [Read on Google AI blog](https://ai.googleblog.com/2019/12/fairness-indicators-scalable.html) + +- + + ### [Fairness Indicators at Google I/O](https://www.youtube.com/watch?v=6CwzDoE8J4M) + + --- + + [Watch the video](https://www.youtube.com/watch?v=6CwzDoE8J4M) + +
diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js new file mode 100644 index 00000000..0be88e04 --- /dev/null +++ b/docs/javascripts/mathjax.js @@ -0,0 +1,19 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..21c97aa9 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,42 @@ +:root { + --md-primary-fg-color: #FFA800; + --md-primary-fg-color--light: #CCCCCC; + --md-primary-fg-color--dark: #425066; +} + +.video-wrapper { + max-width: 240px; + display: flex; + flex-direction: row; +} +.video-wrapper > iframe { + width: 100%; + aspect-ratio: 16 / 9; +} + +.buttons-wrapper { + flex-wrap: wrap; + gap: 1em; + display: flex; + /* flex-grow: 1; */ + /* justify-content: center; */ + /* align-content: center; */ +} + +.buttons-wrapper > a { + justify-content: center; + align-content: center; + flex-wrap: nowrap; + /* gap: 1em; */ + align-items: center; + text-align: center; + flex: 1 1 30%; + display: flex; +} + +.md-button > .buttons-content { + align-items: center; + justify-content: center; + display: flex; + gap: 1em; +} diff --git a/docs/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb b/docs/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb new file mode 100644 index 00000000..82b71085 --- /dev/null +++ b/docs/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb @@ -0,0 +1,441 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Sxt-9qpNgPxo" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Phnw6c3-gQ1f" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aalPefrUUplk" + }, + "source": [ + "# FaceSSD Fairness Indicators Example Colab" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KFRBcGOYgEAI" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZ48WFLwbCL6" + }, + "source": [ + "##Overview\n", + "\n", + "In this activity, you'll use [Fairness Indicators](https://tensorflow.github.io/fairness-indicators) to explore the [FaceSSD predictions on Labeled Faces in the Wild dataset](https://modelcards.withgoogle.com/face-detection). Fairness Indicators is a suite of tools built on top of [TensorFlow Model Analysis](https://tensorflow.github.io/model-analysis/get_started) that enable regular evaluation of fairness metrics in product pipelines.\n", + "\n", + "##About the Dataset\n", + "\n", + "In this exercise, you'll work with the FaceSSD prediction dataset, approximately 200k different image predictions and groundtruths generated by FaceSSD API.\n", + "\n", + "##About the Tools\n", + "\n", + "[TensorFlow Model Analysis](https://tensorflow.github.io/model_analysis/get_started) is a library for evaluating both TensorFlow and non-TensorFlow machine learning models. It allows users to evaluate their models on large amounts of data in a distributed manner, computing in-graph and other metrics over different slices of data and visualize in notebooks.\n", + "\n", + "[TensorFlow Data Validation](https://tensorflow.github.io/data-validation/get_started) is one tool you can use to analyze your data. You can use it to find potential problems in your data, such as missing values and data imbalances, that can lead to Fairness disparities.\n", + "\n", + "With [Fairness Indicators](https://tensorflow.github.io/fairness-indicators/), users will be able to: \n", + "\n", + "* Evaluate model performance, sliced across defined groups of users\n", + "* Feel confident about results with confidence intervals and evaluations at multiple thresholds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u33JXdluZ2lG" + }, + "source": [ + "# Importing\n", + "\n", + "Run the following code to install the fairness_indicators library. 
This package contains the tools we'll be using in this exercise. Restart Runtime may be requested but is not necessary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EoRNffG599XP" + }, + "outputs": [], + "source": [ + "!pip install apache_beam\n", + "!pip install fairness-indicators\n", + "!pip install witwidget\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B8dlyTyiTe-9" + }, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "import apache_beam as beam\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "\n", + "import tensorflow_hub as hub\n", + "import tensorflow as tf\n", + "import tensorflow_model_analysis as tfma\n", + "import tensorflow_data_validation as tfdv\n", + "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", + "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict as agnostic_predict\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", + "\n", + "from witwidget.notebook.visualization import WitConfigBuilder\n", + "from witwidget.notebook.visualization import WitWidget" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TsplOJGqWCf5" + }, + "source": [ + "# Download and Understand the Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vFOQ4AaIcAn2" + }, + "source": [ + "[Labeled Faces in the Wild](http://vis-www.cs.umass.edu/lfw/) is a public benchmark dataset for face verification, also known as pair matching. 
LFW contains more than 13,000 images of faces collected from the web.\n", + "\n", + "We ran FaceSSD predictions on this dataset to predict whether a face is present in a given image. In this Colab, we will slice data according to gender to observe if there are any significant differences between model performance for different gender groups.\n", + "\n", + "If there is more than one face in an image, gender is labeled as \"MISSING\".\n", + "\n", + "We've hosted the dataset on Google Cloud Platform for convenience. Run the following code to download the data from GCP, the data will take about a minute to download and analyze." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NdLBi6tN5i7I" + }, + "outputs": [], + "source": [ + "data_location = tf.keras.utils.get_file('lfw_dataset.tf', 'https://storage.googleapis.com/facessd_dataset/lfw_dataset.tfrecord')\n", + "\n", + "stats = tfdv.generate_statistics_from_tfrecord(data_location=data_location)\n", + "tfdv.visualize_statistics(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cNODEwE5x7Uo" + }, + "source": [ + "# Defining Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZF4NO87uFxdQ" + }, + "outputs": [], + "source": [ + "BASE_DIR = tempfile.gettempdir()\n", + "\n", + "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", + "\n", + "compute_confidence_intervals = True\n", + "\n", + "slice_key = 'object/groundtruth/Gender'\n", + "label_key = 'object/groundtruth/face'\n", + "prediction_key = 'object/prediction/face'\n", + "\n", + "feature_map = {\n", + " slice_key:\n", + " tf.io.FixedLenFeature([], tf.string, default_value=['none']),\n", + " label_key:\n", + " tf.io.FixedLenFeature([], tf.float32, default_value=[0.0]),\n", + " prediction_key:\n", + " tf.io.FixedLenFeature([], tf.float32, default_value=[0.0]),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"gVLHwuhEyI8R" + }, + "source": [ + "# Model Agnostic Config for TFMA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ej1nGCZSyJIK" + }, + "outputs": [], + "source": [ + "model_agnostic_config = agnostic_predict.ModelAgnosticConfig(\n", + " label_keys=[label_key],\n", + " prediction_keys=[prediction_key],\n", + " feature_spec=feature_map)\n", + "\n", + "model_agnostic_extractors = [\n", + " model_agnostic_extractor.ModelAgnosticExtractor(\n", + " model_agnostic_config=model_agnostic_config, desired_batch_size=3),\n", + " tfma.extractors.slice_key_extractor.SliceKeyExtractor(\n", + " [tfma.slicer.SingleSliceSpec(),\n", + " tfma.slicer.SingleSliceSpec(columns=[slice_key])])\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wqkk9SkvyVkR" + }, + "source": [ + "# Fairness Callbacks and Computing Fairness Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A0icrlliBCOb" + }, + "outputs": [], + "source": [ + "# Helper class for counting examples in beam PCollection\n", + "class CountExamples(beam.CombineFn):\n", + " def __init__(self, message):\n", + " self.message = message\n", + "\n", + " def create_accumulator(self):\n", + " return 0\n", + "\n", + " def add_input(self, current_sum, element):\n", + " return current_sum + 1\n", + "\n", + " def merge_accumulators(self, accumulators): \n", + " return sum(accumulators)\n", + "\n", + " def extract_output(self, final_sum):\n", + " if final_sum:\n", + " print(\"%s: %d\"%(self.message, final_sum))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mRQjdjp9yVv2" + }, + "outputs": [], + "source": [ + "metrics_callbacks = [\n", + " tfma.post_export_metrics.fairness_indicators(\n", + " thresholds=[0.1, 0.3, 0.5, 0.7, 0.9],\n", + " labels_key=label_key,\n", + " target_prediction_keys=[prediction_key]),\n", + " tfma.post_export_metrics.auc(\n", + " curve='PR',\n", + " 
labels_key=label_key,\n", + " target_prediction_keys=[prediction_key]),\n", + "]\n", + "\n", + "eval_shared_model = tfma.types.EvalSharedModel(\n", + " add_metrics_callbacks=metrics_callbacks,\n", + " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", + " add_metrics_callbacks=metrics_callbacks,\n", + " config=model_agnostic_config))\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " # Read data.\n", + " data = (\n", + " pipeline\n", + " | 'ReadData' >> beam.io.ReadFromTFRecord(data_location))\n", + "\n", + " # Count all examples.\n", + " data_count = (\n", + " data | 'Count number of examples' >> beam.CombineGlobally(\n", + " CountExamples('Before filtering \"Gender:MISSING\"')))\n", + "\n", + " # If there are more than one face in image, the gender feature is 'MISSING'\n", + " # and we are filtering that image out.\n", + " def filter_missing_gender(element):\n", + " example = tf.train.Example.FromString(element)\n", + " if example.features.feature[slice_key].bytes_list.value[0] != b'MISSING':\n", + " yield element\n", + "\n", + " filtered_data = (\n", + " data\n", + " | 'Filter Missing Gender' >> beam.ParDo(filter_missing_gender))\n", + "\n", + " # Count after filtering \"Gender:MISSING\".\n", + " filtered_data_count = (\n", + " filtered_data | 'Count number of examples after filtering'\n", + " >> beam.CombineGlobally(\n", + " CountExamples('After filtering \"Gender:MISSING\"')))\n", + "\n", + " # Because LFW data set has always faces by default, we are adding\n", + " # labels as 1.0 for all images.\n", + " def add_face_groundtruth(element):\n", + " example = tf.train.Example.FromString(element)\n", + " example.features.feature[label_key].float_list.value[:] = [1.0]\n", + " yield example.SerializeToString()\n", + "\n", + " final_data = (\n", + " filtered_data\n", + " | 'Add Face Groundtruth' >> beam.ParDo(add_face_groundtruth))\n", + "\n", + " # Run TFMA.\n", + " _ = (\n", + " final_data\n", + " | 'ExtractEvaluateAndWriteResults' >>\n", + " 
tfma.ExtractEvaluateAndWriteResults(\n", + " eval_shared_model=eval_shared_model,\n", + " compute_confidence_intervals=compute_confidence_intervals,\n", + " output_path=tfma_eval_result_path,\n", + " extractors=model_agnostic_extractors))\n", + "\n", + "eval_result = tfma.load_eval_result(output_path=tfma_eval_result_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktlASJQIzE3l" + }, + "source": [ + "# Render Fairness Indicators\n", + "\n", + "Render the Fairness Indicators widget with the exported evaluation results.\n", + "\n", + "Below you will see bar charts displaying performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the drop down menus at the top of the visualization.\n", + "\n", + "A relevant metric for this use case is true positive rate, also known as recall. Use the selector on the left hand side to choose the graph for true_positive_rate. These metric values match the values displayed on the [model card](https://modelcards.withgoogle.com/face-detection).\n", + "\n", + "For some photos, gender is labeled as young instead of male or female, if the person in the photo is too young to be accurately annotated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JNaNhTCTAMHm" + }, + "outputs": [], + "source": [ + "widget_view.render_fairness_indicator(eval_result=eval_result,\n", + " slicing_column=slice_key)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "Sxt-9qpNgPxo" + ], + "name": "Facessd Fairness Indicators Example Colab.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_Example_Colab.ipynb b/docs/tutorials/Fairness_Indicators_Example_Colab.ipynb new file mode 100644 index 00000000..b87654cd --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_Example_Colab.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Tce3stUlHN0L" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tuOe1ymfHZPu" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aalPefrUUplk" + }, + "source": [ + "# Introduction to Fairness Indicators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MfBg1C5NB3X0" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YWcPbUNg1yez" + }, + "source": [ + "## Overview\n", + "\n", + "Fairness Indicators is a suite of tools built on top of [TensorFlow Model Analysis (TFMA)](https://tensorflow.github.io/model-analysis/get_started) that enable regular evaluation of fairness metrics in product pipelines. TFMA is a library for evaluating both TensorFlow and non-TensorFlow machine learning models. It allows you to evaluate your models on large amounts of data in a distributed manner, compute in-graph and other metrics over different slices of data, and visualize them in notebooks. \n", + "\n", + "Fairness Indicators is packaged with [TensorFlow Data Validation (TFDV)](https://tensorflow.github.io/data-validation/get_started) and the [What-If Tool](https://pair-code.github.io/what-if-tool/). Using Fairness Indicators allows you to: \n", + "\n", + "* Evaluate model performance, sliced across defined groups of users\n", + "* Gain confidence about results with confidence intervals and evaluations at multiple thresholds\n", + "* Evaluate the distribution of datasets\n", + "* Dive deep into individual slices to explore root causes and opportunities for improvement\n", + "\n", + "In this notebook, you will use Fairness Indicators to fix fairness issues in a model you train using the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification). Watch this [video](https://www.youtube.com/watch?v=pHT-ImFXPQo) for more details and context on the real-world scenario this is based on which is also one of primary motivations for creating Fairness Indicators." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjuCFktB2IJW" + }, + "source": [ + "## Dataset\n", + "\n", + "In this notebook, you will work with the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification), approximately 2 million public comments made public by the [Civil Comments platform](https://medium.com/@aja_15265/saying-goodbye-to-civil-comments-41859d3a2b1d) in 2017 for ongoing research. This effort was sponsored by [Jigsaw](https://jigsaw.google.com/), who have hosted competitions on Kaggle to help classify toxic comments as well as minimize unintended model bias.\n", + "\n", + "Each individual text comment in the dataset has a toxicity label, with the label being 1 if the comment is toxic and 0 if the comment is non-toxic. Within the data, a subset of comments are labeled with a variety of identity attributes, including categories for gender, sexual orientation, religion, and race or ethnicity." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u33JXdluZ2lG" + }, + "source": [ + "## Setup\n", + "\n", + "Install `fairness-indicators` and `witwidget`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EoRNffG599XP" + }, + "outputs": [], + "source": [ + "!pip install -q -U pip==20.2\n", + "\n", + "!pip install -q fairness-indicators\n", + "!pip install -q witwidget" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "alYUSbyv59j5" + }, + "source": [ + "You must restart the Colab runtime after installing. Select **Runtime > Restart** runtime from the Colab menu.\n", + "\n", + "Do not proceed with the rest of this tutorial without first restarting the runtime." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RbRUqXDm6f1N" + }, + "source": [ + "Import all other required libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B8dlyTyiTe-9" + }, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "import apache_beam as beam\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import pprint\n", + "\n", + "from google.protobuf import text_format\n", + "\n", + "import tensorflow_hub as hub\n", + "import tensorflow as tf\n", + "import tensorflow_model_analysis as tfma\n", + "import tensorflow_data_validation as tfdv\n", + "\n", + "from tfx_bsl.tfxio import tensor_adapter\n", + "from tfx_bsl.tfxio import tf_example_record\n", + "\n", + "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", + "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", + "\n", + "from fairness_indicators.tutorial_utils import util\n", + "\n", + "from witwidget.notebook.visualization import WitConfigBuilder\n", + "from witwidget.notebook.visualization import WitWidget\n", + "\n", + "from tensorflow_metadata.proto.v0 import schema_pb2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TsplOJGqWCf5" + }, + "source": [ + "## Download and analyze the data\n", + "\n", + "By default, this notebook downloads a preprocessed version of this dataset, but you may use the original dataset and re-run the processing steps if desired. In the original dataset, each comment is labeled with the percentage of raters who believed that a comment corresponds to a particular identity. For example, a comment might be labeled with the following: { male: 0.3, female: 1.0, transgender: 0.0, heterosexual: 0.8, homosexual_gay_or_lesbian: 1.0 } The processing step groups identity by category (gender, sexual_orientation, etc.) and removes identities with a score less than 0.5. So the example above would be converted to the following:
{ gender: [female], sexual_orientation: [heterosexual, homosexual_gay_or_lesbian] }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qmt4gkBFRBD2" + }, + "outputs": [], + "source": [ + "download_original_data = False #@param {type:\"boolean\"}\n", + "\n", + "if download_original_data:\n", + " train_tf_file = tf.keras.utils.get_file('train_tf.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/train_tf.tfrecord')\n", + " validate_tf_file = tf.keras.utils.get_file('validate_tf.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf.tfrecord')\n", + "\n", + " # The identity terms list will be grouped together by their categories\n", + " # (see 'IDENTITY_COLUMNS') on threshold 0.5. Only the identity term column,\n", + " # text column and label column will be kept after processing.\n", + " train_tf_file = util.convert_comments_data(train_tf_file)\n", + " validate_tf_file = util.convert_comments_data(validate_tf_file)\n", + "\n", + "else:\n", + " train_tf_file = tf.keras.utils.get_file('train_tf_processed.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", + " validate_tf_file = tf.keras.utils.get_file('validate_tf_processed.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vFOQ4AaIcAn2" + }, + "source": [ + "Use TFDV to analyze the data and find potential problems in it, such as missing values and data imbalances, that can lead to fairness disparities."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NdLBi6tN5i7I" + }, + "outputs": [], + "source": [ + "stats = tfdv.generate_statistics_from_tfrecord(data_location=train_tf_file)\n", + "tfdv.visualize_statistics(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AS9QiA96GXDE" + }, + "source": [ + "TFDV shows that there are some significant imbalances in the data which could lead to biased model outcomes. \n", + "\n", + "* The toxicity label (the value predicted by the model) is unbalanced. Only 8% of the examples in the training set are toxic, which means that a classifier could get 92% accuracy by predicting that all comments are non-toxic.\n", + "\n", + "* In the fields relating to identity terms, only 6.6k out of the 1.08 million (0.61%) training examples deal with homosexuality, and those related to bisexuality are even more rare. This indicates that performance on these slices may suffer due to lack of training data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ekzb7vVnPCc" + }, + "source": [ + "## Prepare the data\n", + "\n", + "Define a feature map to parse the data. Each example will have a label, comment text, and identity features `sexual orientation`, `gender`, `religion`, `race`, and `disability` that are associated with the text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n4_nXQDykX6W" + }, + "outputs": [], + "source": [ + "BASE_DIR = tempfile.gettempdir()\n", + "\n", + "TEXT_FEATURE = 'comment_text'\n", + "LABEL = 'toxicity'\n", + "FEATURE_MAP = {\n", + " # Label:\n", + " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", + " # Text:\n", + " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", + "\n", + " # Identities:\n", + " 'sexual_orientation':tf.io.VarLenFeature(tf.string),\n", + " 'gender':tf.io.VarLenFeature(tf.string),\n", + " 'religion':tf.io.VarLenFeature(tf.string),\n", + " 'race':tf.io.VarLenFeature(tf.string),\n", + " 'disability':tf.io.VarLenFeature(tf.string),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1B1ROCM__y8C" + }, + "source": [ + "Next, set up an input function to feed data into the model. Add a weight column to each example and upweight the toxic examples to account for the class imbalance identified by the TFDV. Use only identity features during the evaluation phase, as only the comments are fed into the model during training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YwoC-dzEDid3" + }, + "outputs": [], + "source": [ + "def train_input_fn():\n", + " def parse_function(serialized):\n", + " parsed_example = tf.io.parse_single_example(\n", + " serialized=serialized, features=FEATURE_MAP)\n", + " # Adds a weight column to deal with unbalanced classes.\n", + " parsed_example['weight'] = tf.add(parsed_example[LABEL], 0.1)\n", + " return (parsed_example,\n", + " parsed_example[LABEL])\n", + " train_dataset = tf.data.TFRecordDataset(\n", + " filenames=[train_tf_file]).map(parse_function).batch(512)\n", + " return train_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mfbgerCsEOmN" + }, + "source": [ + "## Train the model\n", + "\n", + "Create and train a deep learning model on the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JaGvNrVijfws" + }, + "outputs": [], + "source": [ + "model_dir = os.path.join(BASE_DIR, 'train', datetime.now().strftime(\n", + " \"%Y%m%d-%H%M%S\"))\n", + "\n", + "embedded_text_feature_column = hub.text_embedding_column(\n", + " key=TEXT_FEATURE,\n", + " module_spec='https://tfhub.dev/google/nnlm-en-dim128/1')\n", + "\n", + "classifier = tf.estimator.DNNClassifier(\n", + " hidden_units=[500, 100],\n", + " weight_column='weight',\n", + " feature_columns=[embedded_text_feature_column],\n", + " optimizer=tf.keras.optimizers.legacy.Adagrad(learning_rate=0.003),\n", + " loss_reduction=tf.losses.Reduction.SUM,\n", + " n_classes=2,\n", + " model_dir=model_dir)\n", + "\n", + "classifier.train(input_fn=train_input_fn, steps=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTPqije9Eg5b" + }, + "source": [ + "## Analyze the model\n", + "\n", + "After obtaining the trained model, analyze it to compute fairness metrics using TFMA and Fairness Indicators. Begin by exporting the model as a [SavedModel](https://www.tensorflow.org/guide/saved_model). 
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-vRc-Jyp8dRm" + }, + "source": [ + "### Export SavedModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QLjiy5VCzlRw" + }, + "outputs": [], + "source": [ + "def eval_input_receiver_fn():\n", + " serialized_tf_example = tf.compat.v1.placeholder(\n", + " dtype=tf.string, shape=[None], name='input_example_placeholder')\n", + "\n", + " # This *must* be a dictionary containing a single key 'examples', which\n", + " # points to the input placeholder.\n", + " receiver_tensors = {'examples': serialized_tf_example}\n", + "\n", + " features = tf.io.parse_example(serialized_tf_example, FEATURE_MAP)\n", + " features['weight'] = tf.ones_like(features[LABEL])\n", + "\n", + " return tfma.export.EvalInputReceiver(\n", + " features=features,\n", + " receiver_tensors=receiver_tensors,\n", + " labels=features[LABEL])\n", + "\n", + "tfma_export_dir = tfma.export.export_eval_savedmodel(\n", + " estimator=classifier,\n", + " export_dir_base=os.path.join(BASE_DIR, 'tfma_eval_model'),\n", + " eval_input_receiver_fn=eval_input_receiver_fn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3j8ODcee8rQ8" + }, + "source": [ + "### Compute Fairness Metrics\n", + "\n", + "Select the identity to compute metrics for and whether to run with confidence intervals using the dropdown in the panel on the right." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7shDmJbx9mqa" + }, + "outputs": [], + "source": [ + "#@title Fairness Indicators Computation Options\n", + "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", + "\n", + "#@markdown Modify the slice_selection for experiments on other identities.\n", + "slice_selection = 'sexual_orientation' #@param [\"sexual_orientation\", \"gender\", \"religion\", \"race\", \"disability\"]\n", + "print(f'Slice selection: {slice_selection}')\n", + "#@markdown Confidence Intervals can help you make better decisions regarding your data, but as it requires computing multiple resamples, is slower particularly in the colab environment that cannot take advantage of parallelization.\n", + "compute_confidence_intervals = False #@param {type:\"boolean\"}\n", + "print(f'Compute confidence intervals: {compute_confidence_intervals}')\n", + "\n", + "# Define slices that you want the evaluation to run on.\n", + "eval_config_pbtxt = \"\"\"\n", + " model_specs {\n", + " label_key: \"%s\"\n", + " }\n", + " metrics_specs {\n", + " metrics {\n", + " class_name: \"FairnessIndicators\"\n", + " config: '{ \"thresholds\": [0.1, 0.3, 0.5, 0.7, 0.9] }'\n", + " }\n", + " }\n", + " slicing_specs {} # overall slice\n", + " slicing_specs {\n", + " feature_keys: [\"%s\"]\n", + " }\n", + " options {\n", + " compute_confidence_intervals { value: %s }\n", + " disabled_outputs { values: \"analysis\" }\n", + " }\n", + " \"\"\" % (LABEL, slice_selection, compute_confidence_intervals)\n", + "eval_config = text_format.Parse(eval_config_pbtxt, tfma.EvalConfig())\n", + "eval_shared_model = tfma.default_eval_shared_model(\n", + " eval_saved_model_path=tfma_export_dir)\n", + "\n", + "schema = text_format.Parse(\n", + " \"\"\"\n", + " tensor_representation_group {\n", + " key: \"\"\n", + " value {\n", + " tensor_representation {\n", + " key: \"comment_text\"\n", + " value {\n", + " dense_tensor {\n", + " column_name: 
\"comment_text\"\n", + " shape {}\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " feature {\n", + " name: \"comment_text\"\n", + " type: BYTES\n", + " }\n", + " feature {\n", + " name: \"toxicity\"\n", + " type: FLOAT\n", + " }\n", + " feature {\n", + " name: \"sexual_orientation\"\n", + " type: BYTES\n", + " }\n", + " feature {\n", + " name: \"gender\"\n", + " type: BYTES\n", + " }\n", + " feature {\n", + " name: \"religion\"\n", + " type: BYTES\n", + " }\n", + " feature {\n", + " name: \"race\"\n", + " type: BYTES\n", + " }\n", + " feature {\n", + " name: \"disability\"\n", + " type: BYTES\n", + " }\n", + " \"\"\", schema_pb2.Schema())\n", + "tfxio = tf_example_record.TFExampleRecord(\n", + " file_pattern=validate_tf_file,\n", + " schema=schema,\n", + " raw_record_column_name=tfma.ARROW_INPUT_COLUMN)\n", + "tensor_adapter_config = tensor_adapter.TensorAdapterConfig(\n", + " arrow_schema=tfxio.ArrowSchema(),\n", + " tensor_representations=tfxio.TensorRepresentations())\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " (pipeline\n", + " | 'ReadFromTFRecordToArrow' >> tfxio.BeamSource()\n", + " | 'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(\n", + " eval_config=eval_config,\n", + " eval_shared_model=eval_shared_model,\n", + " output_path=tfma_eval_result_path,\n", + " tensor_adapter_config=tensor_adapter_config))\n", + "\n", + "eval_result = tfma.load_eval_result(output_path=tfma_eval_result_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jtDpTBPeRw2d" + }, + "source": [ + "### Visualize data using the What-if Tool\n", + "\n", + "In this section, you'll use the What-If Tool's interactive visual interface to explore and manipulate data at a micro-level.\n", + "\n", + "Each point on the scatter plot on the right-hand panel represents one of the examples in the subset loaded into the tool. Click on one of the points to see details about this particular example in the left-hand panel. 
The comment text, ground truth toxicity, and applicable identities are shown. At the bottom of this left-hand panel, you see the inference results from the model you just trained.\n", + "\n", + "Modify the text of the example and then click the **Run inference** button to view how your changes caused the perceived toxicity prediction to change." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wtjZo4BDlV1m" + }, + "outputs": [], + "source": [ + "DEFAULT_MAX_EXAMPLES = 1000\n", + "\n", + "# Load 100000 examples in memory. When first rendered, \n", + "# What-If Tool should only display 1000 of these due to browser constraints.\n", + "def wit_dataset(file, num_examples=100000):\n", + " dataset = tf.data.TFRecordDataset(\n", + " filenames=[file]).take(num_examples)\n", + " return [tf.train.Example.FromString(d.numpy()) for d in dataset]\n", + "\n", + "wit_data = wit_dataset(train_tf_file)\n", + "config_builder = WitConfigBuilder(wit_data[:DEFAULT_MAX_EXAMPLES]).set_estimator_and_feature_spec(\n", + " classifier, FEATURE_MAP).set_label_vocab(['non-toxicity', LABEL]).set_target_feature(LABEL)\n", + "wit = WitWidget(config_builder)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktlASJQIzE3l" + }, + "source": [ + "## Render Fairness Indicators\n", + "\n", + "Render the Fairness Indicators widget with the exported evaluation results.\n", + "\n", + "Below you will see bar charts displaying performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the dropdown menus at the top of the visualization. \n", + "\n", + "The Fairness Indicator widget is integrated with the What-If Tool rendered above. If you select one slice of the data in the bar chart, the What-If Tool will update to show you examples from the selected slice. When the data reloads in the What-If Tool above, try modifying **Color By** to **toxicity**. 
This can give you a visual understanding of the toxicity balance of examples by slice." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JNaNhTCTAMHm" + }, + "outputs": [], + "source": [ + "event_handlers={'slice-selected':\n", + " wit.create_selection_callback(wit_data, DEFAULT_MAX_EXAMPLES)}\n", + "widget_view.render_fairness_indicator(eval_result=eval_result,\n", + " slicing_column=slice_selection,\n", + " event_handlers=event_handlers\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nRuZsLr6V_fY" + }, + "source": [ + "With this particular dataset and task, systematically higher false positive and false negative rates for certain identities can lead to negative consequences. For example, in a content moderation system, a higher-than-overall false positive rate for a certain group can lead to those voices being silenced. Thus, it is important to regularly evaluate these types of criteria as you develop and improve models, and utilize tools such as Fairness Indicators, TFDV, and WIT to help illuminate potential problems. Once you've identified fairness issues, you can experiment with new data sources, data balancing, or other techniques to improve performance on underperforming groups.\n", + "\n", + "See [here](../../guide/guidance) for more information and guidance on how to use Fairness Indicators.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wCMEMtGfx0Ti" + }, + "source": [ + "## Use fairness evaluation results\n", + "\n", + "The [`eval_result`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult) object, rendered above in `render_fairness_indicator()`, has its own API that you can leverage to read TFMA results into your programs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z6stkMLwyfza" + }, + "source": [ + "### Get evaluated slices and metrics\n", + "\n", + "Use [`get_slice_names()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_slice_names) and [`get_metric_names()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_metric_names) to get the evaluated slices and metrics, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eXrt7SdZyzWD" + }, + "outputs": [], + "source": [ + "pp = pprint.PrettyPrinter()\n", + "\n", + "print(\"Slices:\")\n", + "pp.pprint(eval_result.get_slice_names())\n", + "print(\"\\nMetrics:\")\n", + "pp.pprint(eval_result.get_metric_names())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ctAvudY2zUu4" + }, + "source": [ + "Use [`get_metrics_for_slice()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResultget_metrics_for_slice) to get the metrics for a particular slice as a dictionary mapping metric names to [metric values](https://github.com/tensorflow/model-analysis/blob/cdb6790dcd7a37c82afb493859b3ef4898963fee/tensorflow_model_analysis/proto/metrics_for_slice.proto#L194)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zjCxZGHmzF0R" + }, + "outputs": [], + "source": [ + "baseline_slice = ()\n", + "heterosexual_slice = (('sexual_orientation', 'heterosexual'),)\n", + "\n", + "print(\"Baseline metric values:\")\n", + "pp.pprint(eval_result.get_metrics_for_slice(baseline_slice))\n", + "print(\"\\nHeterosexual metric values:\")\n", + "pp.pprint(eval_result.get_metrics_for_slice(heterosexual_slice))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UDo3LhoR0Rq1" + }, + "source": [ + "Use [`get_metrics_for_all_slices()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_metrics_for_all_slices) to get the metrics for all slices as a dictionary mapping each slice to the corresponding metrics dictionary you obtain from running `get_metrics_for_slice()` on it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "96N2l2xI0fZd" + }, + "outputs": [], + "source": [ + "pp.pprint(eval_result.get_metrics_for_all_slices())" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Fairness Indicators Example Colab.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb b/docs/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb new file mode 100644 index 00000000..afa3ad15 --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb @@ -0,0 
+1,499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Bfrh3DUze0QN" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx-jnufYfcJG" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s1bQihY6-Y4N" + }, + "source": [ + "# Pandas DataFrame to Fairness Indicators Case Study\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XHTjeiUMeolM" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ay80altXzvgZ" + }, + "source": [ + "## Case Study Overview\n", + "In this case study we will apply [TensorFlow Model Analysis](https://tensorflow.github.io/model-analysis/get_started) and [Fairness Indicators](https://tensorflow.github.io/fairness-indicators) to evaluate data stored as a Pandas DataFrame, where each row contains ground truth labels, various features, and a model prediction. We will show how this workflow can be used to spot potential fairness concerns, independent of the framework one used to construct and train the model. As in this case study, we can analyze the results from any machine learning framework (e.g. TensorFlow, JAX, etc) once they are converted to a Pandas DataFrame.\n", + " \n", + "For this exercise, we will leverage the Deep Neural Network (DNN) model that was developed in the [Shape Constraints for Ethics with Tensorflow Lattice](https://colab.research.google.com/github/tensorflow/lattice/blob/master/docs/tutorials/shape_constraints_for_ethics.ipynb#scrollTo=uc0VwsT5nvQi) case study using the Law School Admissions dataset from the Law School Admissions Council (LSAC). This classifier attempts to predict whether or not a student will pass the bar, based on their Law School Admission Test (LSAT) score and undergraduate GPA.\n", + "\n", + "## LSAC Dataset\n", + "The dataset used within this case study was originally collected for a study called '[LSAC National Longitudinal Bar Passage Study. LSAC Research Report Series](https://eric.ed.gov/?id=ED469370)' by Linda Wightman in 1998. 
The dataset is currently hosted [here](http://www.seaphe.org/databases.php).\n", + "\n", + "* **dnn_bar_pass_prediction**: The LSAT prediction from the DNN model.\n", + "* **gender**: Gender of the student.\n", + "* **lsat**: LSAT score received by the student.\n", + "* **pass_bar**: Ground truth label indicating whether or not the student eventually passed the bar.\n", + "* **race**: Race of the student.\n", + "* **ugpa**: A student's undergraduate GPA.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ob01ASKqixfw" + }, + "outputs": [], + "source": [ + "!pip install -q -U pip==20.2\n", + "\n", + "!pip install -q -U \\\n", + " tensorflow-model-analysis==0.48.0 \\\n", + " tensorflow-data-validation==1.17.0 \\\n", + " tfx-bsl==1.17.1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tnxSvgkaSEIj" + }, + "source": [ + "## Importing required packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0q8cTfpTkEMP" + }, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "import pandas as pd\n", + "import six.moves.urllib as urllib\n", + "import pprint\n", + "\n", + "import tensorflow_model_analysis as tfma\n", + "from google.protobuf import text_format\n", + "\n", + "import tensorflow as tf\n", + "tf.compat.v1.enable_v2_behavior()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b8kWW3t4-eS1" + }, + "source": [ + "## Download the data and explore the initial dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wMZJtgj0qJ0x" + }, + "outputs": [], + "source": [ + "# Download the LSAT dataset and setup the required filepaths.\n", + "_DATA_ROOT = tempfile.mkdtemp(prefix='lsat-data')\n", + "_DATA_PATH = 'https://storage.googleapis.com/lawschool_dataset/bar_pass_prediction.csv'\n", + "_DATA_FILEPATH = os.path.join(_DATA_ROOT, 'bar_pass_prediction.csv')\n", + "\n", + "data = urllib.request.urlopen(_DATA_PATH)\n", + "\n", + "_LSAT_DF = pd.read_csv(data)\n", + "\n", + "# To simpliy the case study, we will only use the columns that will be used for\n", + "# our model.\n", + "_COLUMN_NAMES = [\n", + " 'dnn_bar_pass_prediction',\n", + " 'gender',\n", + " 'lsat',\n", + " 'pass_bar',\n", + " 'race1',\n", + " 'ugpa',\n", + "]\n", + "\n", + "_LSAT_DF.dropna()\n", + "_LSAT_DF['gender'] = _LSAT_DF['gender'].astype(str)\n", + "_LSAT_DF['race1'] = _LSAT_DF['race1'].astype(str)\n", + "_LSAT_DF = _LSAT_DF[_COLUMN_NAMES]\n", + "\n", + "_LSAT_DF.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GyeVg2s7-wlB" + }, + "source": [ + "## Configure Fairness Indicators.\n", + "There are several parameters that you’ll need to take into account when using Fairness Indicators with a DataFrame \n", + "\n", + "* Your input DataFrame must contain a prediction column and label column from your model. By default Fairness Indicators will look for a prediction column called `prediction` and a label column called `label` within your DataFrame.\n", + " * If either of these values are not found a KeyError will be raised.\n", + "\n", + "* In addition to a DataFrame, you’ll also need to include an `eval_config` that should include the metrics to compute, slices to compute the metrics on, and the column names for example labels and predictions. \n", + " * `metrics_specs` will set the metrics to compute. 
The `FairnessIndicators` metric will be required to render the fairness metrics and you can see a list of additional optional metrics [here](https://tensorflow.github.io/model-analysis/metrics).\n", + "\n", + " * `slicing_specs` is an optional slicing parameter to specify what feature you’re interested in investigating. Within this case study race1 is used, however you can also set this value to another feature (for example gender in the context of this DataFrame). If `slicing_specs` is not provided all features will be included.\n", + " * If your DataFrame includes a label or prediction column that is different from the default `prediction` or `label`, you can configure the `label_key` and `prediction_key` to a new value.\n", + "\n", + "* If `output_path` is not specified a temporary directory will be created." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "53caFasB5V9p" + }, + "outputs": [], + "source": [ + "# Specify Fairness Indicators in eval_config.\n", + "eval_config = text_format.Parse(\"\"\"\n", + " model_specs {\n", + " prediction_key: 'dnn_bar_pass_prediction',\n", + " label_key: 'pass_bar'\n", + " }\n", + " metrics_specs {\n", + " metrics {class_name: \"AUC\"}\n", + " metrics {\n", + " class_name: \"FairnessIndicators\"\n", + " config: '{\"thresholds\": [0.50, 0.90]}'\n", + " }\n", + " }\n", + " slicing_specs {\n", + " feature_keys: 'race1'\n", + " }\n", + " slicing_specs {}\n", + " \"\"\", tfma.EvalConfig())\n", + "\n", + "# Run TensorFlow Model Analysis.\n", + "eval_result = tfma.analyze_raw_data(\n", + " data=_LSAT_DF,\n", + " eval_config=eval_config,\n", + " output_path=_DATA_ROOT)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KD96mw0e--DE" + }, + "source": [ + "## Explore model performance with Fairness Indicators.\n", + "\n", + "After running Fairness Indicators, we can visualize different metrics that we selected to analyze our models performance. 
Within this case study we’ve included Fairness Indicators and arbitrarily picked AUC.\n", + "\n", + "When we first look at the overall AUC for each race slice we can see a slight discrepancy in model performance, but nothing that is arguably alarming.\n", + "\n", + "* **Asian**: 0.58\n", + "* **Black**: 0.58\n", + "* **Hispanic**: 0.58\n", + "* **Other**: 0.64\n", + "* **White**: 0.6\n", + "\n", + "However, when we look at the false negative rates split by race, our model again incorrectly predicts the likelihood of a user passing the bar at different rates and, this time, does so by a lot. \n", + "\n", + "* **Asian**: 0.01\n", + "* **Black**: 0.05\n", + "* **Hispanic**: 0.02\n", + "* **Other**: 0.01\n", + "* **White**: 0.01\n", + "\n", + "Most notably the difference between Black and White students is about 380%, meaning that our model is nearly 4x more likely to incorrectly predict that a black student will not pass the bar, than a whilte student. If we were to continue with this effort, a practitioner could use these results as a signal that they should spend more time ensuring that their model works well for people from all backgrounds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIdchYPb-_ZV" + }, + "outputs": [], + "source": [ + "# Render Fairness Indicators.\n", + "tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NprhBTCbY1sF" + }, + "source": [ + "# tfma.EvalResult" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6f92-e98Y40r" + }, + "source": [ + "The [`eval_result`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult) object, rendered above in `render_fairness_indicator()`, has its own API that can be used to read TFMA results into your programs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CDDUxdx-Y8e0" + }, + "source": [ + "## [`get_slice_names()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_slice_names) and [`get_metric_names()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_metric_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oG_mNUNbY98t" + }, + "source": [ + "To get the evaluated slices and metrics, you can use the respective functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kbA1sXhCY_G7" + }, + "outputs": [], + "source": [ + "pp = pprint.PrettyPrinter()\n", + "\n", + "print(\"Slices:\")\n", + "pp.pprint(eval_result.get_slice_names())\n", + "print(\"\\nMetrics:\")\n", + "pp.pprint(eval_result.get_metric_names())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rA1M8aBmZAk6" + }, + "source": [ + "## [`get_metrics_for_slice()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_metrics_for_slice) and [`get_metrics_for_all_slices()`](https://tensorflow.github.io/model-analysis/api_docs/python/tfma/#tensorflow_model_analysis.EvalResult.get_metrics_for_all_slices)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a3Ath5MsZCRX" + }, + "source": [ + "If you want to get the metrics for a particular slice, you can use `get_metrics_for_slice()`. It returns a dictionary mapping metric names to [metric values](https://github.com/tensorflow/model-analysis/blob/cdb6790dcd7a37c82afb493859b3ef4898963fee/tensorflow_model_analysis/proto/metrics_for_slice.proto#L194)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9BWg5HoyZDh-" + }, + "outputs": [], + "source": [ + "baseline_slice = ()\n", + "black_slice = (('race1', 'black'),)\n", + "\n", + "print(\"Baseline metric values:\")\n", + "pp.pprint(eval_result.get_metrics_for_slice(baseline_slice))\n", + "print(\"Black metric values:\")\n", + "pp.pprint(eval_result.get_metrics_for_slice(black_slice))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDcOxvqBZEfg" + }, + "source": [ + "If you want to get the metrics for all slices, `get_metrics_for_all_slices()` returns a dictionary mapping each slice to the corresponding `get_metrics_for_slices(slice)`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p4NQCi52ZFrw" + }, + "outputs": [], + "source": [ + "pp.pprint(eval_result.get_metrics_for_all_slices())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y-nbqnSTkmW3" + }, + "source": [ + "## Conclusion\n", + "Within this case study we imported a dataset into a Pandas DataFrame that we then analyzed with Fairness Indicators. Understanding the results of your model and underlying data is an important step in ensuring your model doesn't reflect harmful bias. In the context of this case study we examined the the LSAC dataset and how predictions from this data could be impacted by a students race. The concept of “what is unfair and what is fair have been introduced in multiple disciplines for well over 50 years, including in education, hiring, and machine learning.”1 Fairness Indicator is a tool to help mitigate fairness concerns in your machine learning model.\n", + "\n", + "For more information on using Fairness Indicators and resources to learn more about fairness concerns see [here](../../).\n", + "\n", + "---\n", + "\n", + "1. Hutchinson, B., Mitchell, M. (2018). 50 Years of Test (Un)fairness: Lessons for Machine Learning. 
https://arxiv.org/abs/1811.10104\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "REV1rBnoBAo1" + }, + "source": [ + "## Appendix\n", + "\n", + "Below are a few functions to help convert ML models to Pandas DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F4qv9GXiBsFA" + }, + "outputs": [], + "source": [ + "# TensorFlow Estimator to Pandas DataFrame:\n", + "\n", + "# _X_VALUE = # X value of binary estimator.\n", + "# _Y_VALUE = # Y value of binary estimator.\n", + "# _GROUND_TRUTH_LABEL = # Ground truth value of binary estimator.\n", + "\n", + "def _get_predicted_probabilities(estimator, input_df, get_input_fn):\n", + " predictions = estimator.predict(\n", + " input_fn=get_input_fn(input_df=input_df, num_epochs=1))\n", + " return [prediction['probabilities'][1] for prediction in predictions]\n", + "\n", + "def _get_input_fn_law(input_df, num_epochs, batch_size=None):\n", + " return tf.compat.v1.estimator.inputs.pandas_input_fn(\n", + " x=input_df[[_X_VALUE, _Y_VALUE]],\n", + " y=input_df[_GROUND_TRUTH_LABEL],\n", + " num_epochs=num_epochs,\n", + " batch_size=batch_size or len(input_df),\n", + " shuffle=False)\n", + "\n", + "def estimator_to_dataframe(estimator, input_df, num_keypoints=20):\n", + " x = np.linspace(min(input_df[_X_VALUE]), max(input_df[_X_VALUE]), num_keypoints)\n", + " y = np.linspace(min(input_df[_Y_VALUE]), max(input_df[_Y_VALUE]), num_keypoints)\n", + "\n", + " x_grid, y_grid = np.meshgrid(x, y)\n", + "\n", + " positions = np.vstack([x_grid.ravel(), y_grid.ravel()])\n", + " plot_df = pd.DataFrame(positions.T, columns=[_X_VALUE, _Y_VALUE])\n", + " plot_df[_GROUND_TRUTH_LABEL] = np.ones(len(plot_df))\n", + " predictions = _get_predicted_probabilities(\n", + " estimator=estimator, input_df=plot_df, get_input_fn=_get_input_fn_law)\n", + " return pd.DataFrame(\n", + " data=np.array(np.reshape(predictions, x_grid.shape)).flatten())" + ] + } + ], + "metadata": { + "colab": { + 
"collapsed_sections": [ + "Bfrh3DUze0QN" + ], + "name": "Pandas DataFrame to Fairness Indicators Case Study", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb b/docs/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb new file mode 100644 index 00000000..a702648d --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb @@ -0,0 +1,1012 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "JmvzTcYice-_" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zlvAS8a9cD_t" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b2VYQpTttmVN" + }, + "source": [ + "# TensorFlow Constrained Optimization Example Using CelebA Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3iFsS2WSeRwe" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-DQoReGDeN16" + }, + "source": [ + "This notebook demonstrates an easy way to create and optimize constrained problems using the TFCO library. This method can be useful in improving models when we find that they’re not performing equally well across different slices of our data, which we can identify using [Fairness Indicators](../../). The second of Google’s AI principles states that our technology should avoid creating or reinforcing unfair bias, and we believe this technique can help improve model fairness in some situations. In particular, this notebook will:\n", + "\n", + "\n", + "* Train a simple, *unconstrained* neural network model to detect a person's smile in images using [`tf.keras`](https://www.tensorflow.org/guide/keras) and the large-scale CelebFaces Attributes ([CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)) dataset.\n", + "* Evaluate model performance against a commonly used fairness metric across age groups, using Fairness Indicators.\n", + "* Set up a simple constrained optimization problem to achieve fairer performance across age groups.\n", + "* Retrain the now *constrained* model and evaluate performance again, ensuring that our chosen fairness metric has improved.\n", + "\n", + "Last updated: 3/11 Feb 2020" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JyCbEWt5Zxe2" + }, + "source": [ + "# Installation\n", + "This notebook was created in [Colaboratory](https://research.google.com/colaboratory/faq.html), connected to the Python 3 Google Compute Engine backend. If you wish to host this notebook in a different environment, then you should not experience any major issues provided you include all the required packages in the cells below.\n", + "\n", + "Note that the very first time you run the pip installs, you may be asked to restart the runtime because of preinstalled out of date packages. Once you do so, the correct packages will be used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T-Zm-KDdt0bn" + }, + "outputs": [], + "source": [ + "#@title Pip installs\n", + "!pip install -q -U pip==20.2\n", + "\n", + "!pip install git+https://github.com/google-research/tensorflow_constrained_optimization\n", + "!pip install -q tensorflow-datasets tensorflow\n", + "!pip install fairness-indicators \\\n", + " \"absl-py==0.12.0\" \\\n", + " \"apache-beam<3,>=2.47\" \\\n", + " \"avro-python3==1.9.1\" \\\n", + " \"pyzmq==17.0.0\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UXWXhBLvISOY" + }, + "source": [ + "Note that depending on when you run the cell below, you may receive a warning about the default version of TensorFlow in Colab switching to TensorFlow 2.X soon. You can safely ignore that warning as this notebook was designed to be compatible with TensorFlow 1.X and 2.X." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UTBBdSGaZ8aW" + }, + "outputs": [], + "source": [ + "#@title Import Modules\n", + "import os\n", + "import sys\n", + "import tempfile\n", + "import urllib\n", + "\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "\n", + "import tensorflow_datasets as tfds\n", + "tfds.disable_progress_bar()\n", + "\n", + "import numpy as np\n", + "\n", + "import tensorflow_constrained_optimization as tfco\n", + "\n", + "from tensorflow_metadata.proto.v0 import schema_pb2\n", + "from tfx_bsl.tfxio import tensor_adapter\n", + "from tfx_bsl.tfxio import tf_example_record" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "70tLum8uIZUm" + }, + "source": [ + "Additionally, we add a few imports that are specific to Fairness Indicators which we will use to evaluate and visualize the model's performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "7Se0Z0Bo9K-5" + }, + "outputs": [], + "source": [ + "#@title Fairness Indicators related imports\n", + "import tensorflow_model_analysis as tfma\n", + "import fairness_indicators as fi\n", + "from google.protobuf import text_format\n", + "import apache_beam as beam" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xSG2HP7goGrj" + }, + "source": [ + "Although TFCO is compatible with eager and graph execution, this notebook assumes that eager execution is enabled by default as it is in TensorFlow 2.x. To ensure that nothing breaks, eager execution will be enabled in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W0ZusW1-lBao" + }, + "outputs": [], + "source": [ + "#@title Enable Eager Execution and Print Versions\n", + "if tf.__version__ < \"2.0.0\":\n", + " tf.compat.v1.enable_eager_execution()\n", + " print(\"Eager execution enabled.\")\n", + "else:\n", + " print(\"Eager execution enabled by default.\")\n", + "\n", + "print(\"TensorFlow \" + tf.__version__)\n", + "print(\"TFMA \" + tfma.VERSION_STRING)\n", + "print(\"TFDS \" + tfds.version.__version__)\n", + "print(\"FI \" + fi.version.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "idY3Uuk3yvty" + }, + "source": [ + "# CelebA Dataset\n", + "[CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale face attributes dataset with more than 200,000 celebrity images, each with 40 attribute annotations (such as hair type, fashion accessories, facial features, etc.) and 5 landmark locations (eyes, mouth and nose positions). 
For more details take a look at [the paper](https://liuziwei7.github.io/projects/FaceAttributes.html).\n", + "With the permission of the owners, we have stored this dataset on Google Cloud Storage and mostly access it via [TensorFlow Datasets(`tfds`)](https://www.tensorflow.org/datasets).\n", + "\n", + "In this notebook:\n", + "* Our model will attempt to classify whether the subject of the image is smiling, as represented by the \"Smiling\" attribute*.\n", + "* Images will be resized from 218x178 to 28x28 to reduce the execution time and memory when training.\n", + "* Our model's performance will be evaluated across age groups, using the binary \"Young\" attribute. We will call this \"age group\" in this notebook.\n", + "\n", + "___\n", + "\n", + "* While there is little information available about the labeling methodology for this dataset, we will assume that the \"Smiling\" attribute was determined by a pleased, kind, or amused expression on the subject's face. For the purpose of this case study, we will take these labels as ground truth.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zCSemFST0b89" + }, + "outputs": [], + "source": [ + "gcs_base_dir = \"gs://celeb_a_dataset/\"\n", + "celeb_a_builder = tfds.builder(\"celeb_a\", data_dir=gcs_base_dir, version='2.0.0')\n", + "\n", + "celeb_a_builder.download_and_prepare()\n", + "\n", + "num_test_shards_dict = {'0.3.0': 4, '2.0.0': 2} # Used because we download the test dataset separately\n", + "version = str(celeb_a_builder.info.version)\n", + "print('Celeb_A dataset version: %s' % version)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "Ocqv3R06APfW" + }, + "outputs": [], + "source": [ + "#@title Test dataset helper functions\n", + "local_root = tempfile.mkdtemp(prefix='test-data')\n", + "def local_test_filename_base():\n", + " return local_root\n", + "\n", + "def 
local_test_file_full_prefix():\n", + " return os.path.join(local_test_filename_base(), \"celeb_a-test.tfrecord\")\n", + "\n", + "def copy_test_files_to_local():\n", + " filename_base = local_test_file_full_prefix()\n", + " num_test_shards = num_test_shards_dict[version]\n", + " for shard in range(num_test_shards):\n", + " url = \"https://storage.googleapis.com/celeb_a_dataset/celeb_a/%s/celeb_a-test.tfrecord-0000%s-of-0000%s\" % (version, shard, num_test_shards)\n", + " filename = \"%s-0000%s-of-0000%s\" % (filename_base, shard, num_test_shards)\n", + " res = urllib.request.urlretrieve(url, filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u5PDLXZb_uIj" + }, + "source": [ + "## Caveats\n", + "Before moving forward, there are several considerations to keep in mind in using CelebA:\n", + "* Although in principle this notebook could use any dataset of face images, CelebA was chosen because it contains public domain images of public figures.\n", + "* All of the attribute annotations in CelebA are operationalized as binary categories. For example, the \"Young\" attribute (as determined by the dataset labelers) is denoted as either present or absent in the image.\n", + "* CelebA's categorizations do not reflect real human diversity of attributes.\n", + "* For the purposes of this notebook, the feature containing the \"Young\" attribute is referred to as \"age group\", where the presence of the \"Young\" attribute in an image is labeled as a member of the \"Young\" age group and the absence of the \"Young\" attribute is labeled as a member of the \"Not Young\" age group. 
These are assumptions made as this information is not mentioned in the [original paper](http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Deep_Learning_Face_ICCV_2015_paper.html).\n", + "* As such, performance in the models trained in this notebook is tied to the ways the attributes have been operationalized and annotated by the authors of CelebA.\n", + "* This model should not be used for commercial purposes as that would violate [CelebA's non-commercial research agreement](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Elkiu92cY2bY" + }, + "source": [ + "# Setting Up Input Functions\n", + "The subsequent cells will help streamline the input pipeline as well as visualize performance.\n", + "\n", + "First we define some data-related variables and define a requisite preprocessing function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gDdarTZxk6y4" + }, + "outputs": [], + "source": [ + "#@title Define Variables\n", + "ATTR_KEY = \"attributes\"\n", + "IMAGE_KEY = \"image\"\n", + "LABEL_KEY = \"Smiling\"\n", + "GROUP_KEY = \"Young\"\n", + "IMAGE_SIZE = 28" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "SD-H70Je0cTp" + }, + "outputs": [], + "source": [ + "#@title Define Preprocessing Functions\n", + "def preprocess_input_dict(feat_dict):\n", + " # Separate out the image and target variable from the feature dictionary.\n", + " image = feat_dict[IMAGE_KEY]\n", + " label = feat_dict[ATTR_KEY][LABEL_KEY]\n", + " group = feat_dict[ATTR_KEY][GROUP_KEY]\n", + "\n", + " # Resize and normalize image.\n", + " image = tf.cast(image, tf.float32)\n", + " image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])\n", + " image /= 255.0\n", + "\n", + " # Cast label and group to float32.\n", + " label = tf.cast(label, tf.float32)\n", + " group = tf.cast(group, tf.float32)\n", + "\n", + " feat_dict[IMAGE_KEY] 
= image\n", + " feat_dict[ATTR_KEY][LABEL_KEY] = label\n", + " feat_dict[ATTR_KEY][GROUP_KEY] = group\n", + "\n", + " return feat_dict\n", + "\n", + "get_image_and_label = lambda feat_dict: (feat_dict[IMAGE_KEY], feat_dict[ATTR_KEY][LABEL_KEY])\n", + "get_image_label_and_group = lambda feat_dict: (feat_dict[IMAGE_KEY], feat_dict[ATTR_KEY][LABEL_KEY], feat_dict[ATTR_KEY][GROUP_KEY])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iwg3sPmExciD" + }, + "source": [ + "Then, we build out the data functions we need in the rest of the colab." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KbR64r0VVG5h" + }, + "outputs": [], + "source": [ + "# Train data returning either 2 or 3 elements (the third element being the group)\n", + "def celeb_a_train_data_wo_group(batch_size):\n", + " celeb_a_train_data = celeb_a_builder.as_dataset(split='train').shuffle(1024).repeat().batch(batch_size).map(preprocess_input_dict)\n", + " return celeb_a_train_data.map(get_image_and_label)\n", + "def celeb_a_train_data_w_group(batch_size):\n", + " celeb_a_train_data = celeb_a_builder.as_dataset(split='train').shuffle(1024).repeat().batch(batch_size).map(preprocess_input_dict)\n", + " return celeb_a_train_data.map(get_image_label_and_group)\n", + "\n", + "# Test data for the overall evaluation\n", + "celeb_a_test_data = celeb_a_builder.as_dataset(split='test').batch(1).map(preprocess_input_dict).map(get_image_label_and_group)\n", + "# Copy test data locally to be able to read it into tfma\n", + "copy_test_files_to_local()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NXO3woTxiCk0" + }, + "source": [ + "# Build a simple DNN Model\n", + "Because this notebook focuses on TFCO, we will assemble a simple, unconstrained `tf.keras.Sequential` model.\n", + "\n", + "We may be able to greatly improve model performance by adding some complexity (e.g., more densely-connected layers, exploring different activation functions, 
increasing image size), but that may distract from the goal of demonstrating how easy it is to apply the TFCO library when working with Keras. For that reason, the model will be kept simple — but feel encouraged to explore this space." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RNZhN_zU8DRD" + }, + "outputs": [], + "source": [ + "def create_model():\n", + " # For this notebook, accuracy will be used to evaluate performance.\n", + " METRICS = [\n", + " tf.keras.metrics.BinaryAccuracy(name='accuracy')\n", + " ]\n", + "\n", + " # The model consists of:\n", + " # 1. An input layer that represents the 28x28x3 image flatten.\n", + " # 2. A fully connected layer with 64 units activated by a ReLU function.\n", + " # 3. A single-unit readout layer to output real-scores instead of probabilities.\n", + " model = keras.Sequential([\n", + " keras.layers.Flatten(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), name='image'),\n", + " keras.layers.Dense(64, activation='relu'),\n", + " keras.layers.Dense(1, activation=None)\n", + " ])\n", + "\n", + " # TFCO by default uses hinge loss — and that will also be used in the model.\n", + " model.compile(\n", + " optimizer=tf.keras.optimizers.Adam(0.001),\n", + " loss='hinge',\n", + " metrics=METRICS)\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7A4uKPNVzPVO" + }, + "source": [ + "We also define a function to set seeds to ensure reproducible results. Note that this colab is meant as an educational tool and does not have the stability of a finely tuned production pipeline. Running without setting a seed may lead to varied results. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-IVw4EgKzqSF" + }, + "outputs": [], + "source": [ + "def set_seeds():\n", + " np.random.seed(121212)\n", + " tf.compat.v1.set_random_seed(212121)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xrbjmmeom8pA" + }, + "source": [ + "# Fairness Indicators Helper Functions\n", + "Before training our model, we define a number of helper functions that will allow us to evaluate the model's performance via Fairness Indicators.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1EPF_k620CRN" + }, + "source": [ + "First, we create a helper function to save our model once we train it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ejHbhLW5epar" + }, + "outputs": [], + "source": [ + "def save_model(model, subdir):\n", + " base_dir = tempfile.mkdtemp(prefix='saved_models')\n", + " model_location = os.path.join(base_dir, subdir)\n", + " model.save(model_location, save_format='tf')\n", + " return model_location" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "erhKEvqByCNj" + }, + "source": [ + "Next, we define functions used to preprocess the data in order to correctly pass it through to TFMA." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D2qa8Okwj_U3" + }, + "outputs": [], + "source": [ + "#@title Data Preprocessing functions for \n", + "def tfds_filepattern_for_split(dataset_name, split):\n", + " return f\"{local_test_file_full_prefix()}*\"\n", + "\n", + "class PreprocessCelebA(object):\n", + " \"\"\"Class that deserializes, decodes and applies additional preprocessing for CelebA input.\"\"\"\n", + " def __init__(self, dataset_name):\n", + " builder = tfds.builder(dataset_name)\n", + " self.features = builder.info.features\n", + " example_specs = self.features.get_serialized_info()\n", + " self.parser = tfds.core.example_parser.ExampleParser(example_specs)\n", + "\n", + " def __call__(self, serialized_example):\n", + " # Deserialize\n", + " deserialized_example = self.parser.parse_example(serialized_example)\n", + " # Decode\n", + " decoded_example = self.features.decode_example(deserialized_example)\n", + " # Additional preprocessing\n", + " image = decoded_example[IMAGE_KEY]\n", + " label = decoded_example[ATTR_KEY][LABEL_KEY]\n", + " # Resize and scale image.\n", + " image = tf.cast(image, tf.float32)\n", + " image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])\n", + " image /= 255.0\n", + " image = tf.reshape(image, [-1])\n", + " # Cast label and group to float32.\n", + " label = tf.cast(label, tf.float32)\n", + "\n", + " group = decoded_example[ATTR_KEY][GROUP_KEY]\n", + " \n", + " output = tf.train.Example()\n", + " output.features.feature[IMAGE_KEY].float_list.value.extend(image.numpy().tolist())\n", + " output.features.feature[LABEL_KEY].float_list.value.append(label.numpy())\n", + " output.features.feature[GROUP_KEY].bytes_list.value.append(b\"Young\" if group.numpy() else b'Not Young')\n", + " return output.SerializeToString()\n", + "\n", + "def tfds_as_pcollection(beam_pipeline, dataset_name, split):\n", + " return (\n", + " beam_pipeline\n", + " | 'Read records' >> 
beam.io.ReadFromTFRecord(tfds_filepattern_for_split(dataset_name, split))\n", + " | 'Preprocess' >> beam.Map(PreprocessCelebA(dataset_name))\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fBKvxd2Tz3hK" + }, + "source": [ + "Finally, we define a function that evaluates the results in TFMA." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "30YduitftaNB" + }, + "outputs": [], + "source": [ + "def get_eval_results(model_location, eval_subdir):\n", + " base_dir = tempfile.mkdtemp(prefix='saved_eval_results')\n", + " tfma_eval_result_path = os.path.join(base_dir, eval_subdir)\n", + "\n", + " eval_config_pbtxt = \"\"\"\n", + " model_specs {\n", + " label_key: \"%s\"\n", + " }\n", + " metrics_specs {\n", + " metrics {\n", + " class_name: \"FairnessIndicators\"\n", + " config: '{ \"thresholds\": [0.22, 0.5, 0.75] }'\n", + " }\n", + " metrics {\n", + " class_name: \"ExampleCount\"\n", + " }\n", + " }\n", + " slicing_specs {}\n", + " slicing_specs { feature_keys: \"%s\" }\n", + " options {\n", + " compute_confidence_intervals { value: False }\n", + " disabled_outputs{values: \"analysis\"}\n", + " }\n", + " \"\"\" % (LABEL_KEY, GROUP_KEY)\n", + " \n", + " eval_config = text_format.Parse(eval_config_pbtxt, tfma.EvalConfig())\n", + "\n", + " eval_shared_model = tfma.default_eval_shared_model(\n", + " eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])\n", + "\n", + " schema_pbtxt = \"\"\"\n", + " tensor_representation_group {\n", + " key: \"\"\n", + " value {\n", + " tensor_representation {\n", + " key: \"%s\"\n", + " value {\n", + " dense_tensor {\n", + " column_name: \"%s\"\n", + " shape {\n", + " dim { size: 28 }\n", + " dim { size: 28 }\n", + " dim { size: 3 }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " feature {\n", + " name: \"%s\"\n", + " type: FLOAT\n", + " }\n", + " feature {\n", + " name: \"%s\"\n", + " type: FLOAT\n", + " }\n", + " feature {\n", + " name: 
\"%s\"\n", + " type: BYTES\n", + " }\n", + " \"\"\" % (IMAGE_KEY, IMAGE_KEY, IMAGE_KEY, LABEL_KEY, GROUP_KEY)\n", + " schema = text_format.Parse(schema_pbtxt, schema_pb2.Schema())\n", + " coder = tf_example_record.TFExampleBeamRecord(\n", + " physical_format='inmem', schema=schema,\n", + " raw_record_column_name=tfma.ARROW_INPUT_COLUMN)\n", + " tensor_adapter_config = tensor_adapter.TensorAdapterConfig(\n", + " arrow_schema=coder.ArrowSchema(),\n", + " tensor_representations=coder.TensorRepresentations())\n", + " # Run the fairness evaluation.\n", + " with beam.Pipeline() as pipeline:\n", + " _ = (\n", + " tfds_as_pcollection(pipeline, 'celeb_a', 'test')\n", + " | 'ExamplesToRecordBatch' >> coder.BeamSource()\n", + " | 'ExtractEvaluateAndWriteResults' >>\n", + " tfma.ExtractEvaluateAndWriteResults(\n", + " eval_config=eval_config,\n", + " eval_shared_model=eval_shared_model,\n", + " output_path=tfma_eval_result_path,\n", + " tensor_adapter_config=tensor_adapter_config)\n", + " )\n", + " return tfma.load_eval_result(output_path=tfma_eval_result_path)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "76tZ3vk-tyo9" + }, + "source": [ + "# Train & Evaluate Unconstrained Model\n", + "\n", + "With the model now defined and the input pipeline in place, we’re now ready to train our model. To cut back on the amount of execution time and memory, we will train the model by slicing the data into small batches with only a few repeated iterations.\n", + "\n", + "Note that running this notebook in TensorFlow < 2.0.0 may result in a deprecation warning for `np.where`. Safely ignore this warning as TensorFlow addresses this in 2.X by using `tf.where` in place of `np.where`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3m9OOdU_8GWo" + }, + "outputs": [], + "source": [ + "BATCH_SIZE = 32\n", + "\n", + "# Set seeds to get reproducible results\n", + "set_seeds()\n", + "\n", + "model_unconstrained = create_model()\n", + "model_unconstrained.fit(celeb_a_train_data_wo_group(BATCH_SIZE), epochs=5, steps_per_epoch=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nCtBH9DkvtUy" + }, + "source": [ + "Evaluating the model on the test data should result in a final accuracy score of just over 85%. Not bad for a simple model with no fine tuning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mgsjbxpTIdZf" + }, + "outputs": [], + "source": [ + "print('Overall Results, Unconstrained')\n", + "celeb_a_test_data = celeb_a_builder.as_dataset(split='test').batch(1).map(preprocess_input_dict).map(get_image_label_and_group)\n", + "results = model_unconstrained.evaluate(celeb_a_test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L5jslIrzwIKo" + }, + "source": [ + "However, performance evaluated across age groups may reveal some shortcomings.\n", + "\n", + "To explore this further, we evaluate the model with Fairness Indicators (via TFMA). In particular, we are interested in seeing whether there is a significant gap in performance between \"Young\" and \"Not Young\" categories when evaluated on false positive rate.\n", + "\n", + "A false positive error occurs when the model incorrectly predicts the positive class. In this context, a false positive outcome occurs when the ground truth is an image of a celebrity 'Not Smiling' and the model predicts 'Smiling'. By extension, the false positive rate, which is used in the visualization above, is a measure of accuracy for a test. While this is a relatively mundane error to make in this context, false positive errors can sometimes cause more problematic behaviors. 
For instance, a false positive error in a spam classifier could cause a user to miss an important email." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nFL91nZF1V8D" + }, + "outputs": [], + "source": [ + "model_location = save_model(model_unconstrained, 'model_export_unconstrained')\n", + "eval_results_unconstrained = get_eval_results(model_location, 'eval_results_unconstrained')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "34zHIMW0NHld" + }, + "source": [ + "As mentioned above, we are concentrating on the false positive rate. The current version of Fairness Indicators (0.1.2) selects false negative rate by default. After running the line below, deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KXMVmUMi0ydk" + }, + "outputs": [], + "source": [ + "tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_results_unconstrained)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zYVpZ-DpBsfD" + }, + "source": [ + "As the results show above, we do see a **disproportionate gap between \"Young\" and \"Not Young\" categories**.\n", + "\n", + "This is where TFCO can help by constraining the false positive rate to be within a more acceptable criterion.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZNnI_Eu70gVp" + }, + "source": [ + "# Constrained Model Set Up\n", + "As documented in [TFCO's library](https://github.com/google-research/tensorflow_constrained_optimization/blob/master/README.md), there are several helpers that will make it easier to constrain the problem:\n", + "\n", + "1. `tfco.rate_context()` – This is what will be used in constructing a constraint for each age group category.\n", + "2. 
`tfco.RateMinimizationProblem()`– The rate expression to be minimized here will be the false positive rate subject to age group. In other words, performance now will be evaluated based on the difference between the false positive rates of the age group and that of the overall dataset. For this demonstration, a false positive rate of less than or equal to 5% will be set as the constraint.\n", + "3. `tfco.ProxyLagrangianOptimizerV2()` – This is the helper that will actually solve the rate constraint problem.\n", + "\n", + "The cell below will call on these helpers to set up model training with the fairness constraint.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BTukzvfD6iWr" + }, + "outputs": [], + "source": [ + "# The batch size is needed to create the input, labels and group tensors.\n", + "# These tensors are initialized with all 0's. They will eventually be assigned\n", + "# the batch content to them. A large batch size is chosen so that there are\n", + "# enough number of \"Young\" and \"Not Young\" examples in each batch.\n", + "set_seeds()\n", + "model_constrained = create_model()\n", + "BATCH_SIZE = 32\n", + "\n", + "# Create input tensor.\n", + "input_tensor = tf.Variable(\n", + " np.zeros((BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3), dtype=\"float32\"),\n", + " name=\"input\")\n", + "\n", + "# Create labels and group tensors (assuming both labels and groups are binary).\n", + "labels_tensor = tf.Variable(\n", + " np.zeros(BATCH_SIZE, dtype=\"float32\"), name=\"labels\")\n", + "groups_tensor = tf.Variable(\n", + " np.zeros(BATCH_SIZE, dtype=\"float32\"), name=\"groups\")\n", + "\n", + "# Create a function that returns the applied 'model' to the input tensor\n", + "# and generates constrained predictions.\n", + "def predictions():\n", + " return model_constrained(input_tensor)\n", + "\n", + "# Create overall context and subsetted context.\n", + "# The subsetted context contains subset of examples where 
group attribute < 1\n", + "# (i.e. the subset of \"Not Young\" celebrity images).\n", + "# \"groups_tensor < 1\" is used instead of \"groups_tensor == 0\" as the former\n", + "# would be a comparison on the tensor value, while the latter would be a\n", + "# comparison on the Tensor object.\n", + "context = tfco.rate_context(predictions, labels=lambda:labels_tensor)\n", + "context_subset = context.subset(lambda:groups_tensor < 1)\n", + "\n", + "# Setup list of constraints.\n", + "# In this notebook, the constraint will just be: FPR to less or equal to 5%.\n", + "constraints = [tfco.false_positive_rate(context_subset) <= 0.05]\n", + "\n", + "# Setup rate minimization problem: minimize overall error rate s.t. constraints.\n", + "problem = tfco.RateMinimizationProblem(tfco.error_rate(context), constraints)\n", + "\n", + "# Create constrained optimizer and obtain train_op.\n", + "# Separate optimizers are specified for the objective and constraints\n", + "optimizer = tfco.ProxyLagrangianOptimizerV2(\n", + " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),\n", + " constraint_optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),\n", + " num_constraints=problem.num_constraints)\n", + "\n", + "# A list of all trainable variables is also needed to use TFCO.\n", + "var_list = (model_constrained.trainable_weights + list(problem.trainable_variables) +\n", + " optimizer.trainable_variables())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "thEe8A8UYbrO" + }, + "source": [ + "The model is now set up and ready to be trained with the false positive rate constraint across age group.\n", + "\n", + "Now, because the last iteration of the constrained model may not necessarily be the best performing model in terms of the defined constraint, the TFCO library comes equipped with `tfco.find_best_candidate_index()` that can help choose the best iterate out of the ones found after each epoch. 
Think of `tfco.find_best_candidate_index()` as an added heuristic that ranks each of the outcomes based on accuracy and fairness constraint (in this case, false positive rate across age group) separately with respect to the training data. That way, it can search for a better trade-off between overall accuracy and the fairness constraint.\n", + "\n", + "The following cells will start the training with constraints while also finding the best performing model per iteration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "73doG4HL6nPS" + }, + "outputs": [], + "source": [ + "# Obtain train set batches.\n", + "\n", + "NUM_ITERATIONS = 100 # Number of training iterations.\n", + "SKIP_ITERATIONS = 10 # Print training stats once in this many iterations.\n", + "\n", + "# Create temp directory for saving snapshots of models.\n", + "temp_directory = tempfile.mktemp()\n", + "os.mkdir(temp_directory)\n", + "\n", + "# List of objective and constraints across iterations.\n", + "objective_list = []\n", + "violations_list = []\n", + "\n", + "# Training iterations.\n", + "iteration_count = 0\n", + "for (image, label, group) in celeb_a_train_data_w_group(BATCH_SIZE):\n", + " # Assign current batch to input, labels and groups tensors.\n", + " input_tensor.assign(image)\n", + " labels_tensor.assign(label)\n", + " groups_tensor.assign(group)\n", + "\n", + " # Run gradient update.\n", + " optimizer.minimize(problem, var_list=var_list)\n", + "\n", + " # Record objective and violations.\n", + " objective = problem.objective()\n", + " violations = problem.constraints()\n", + "\n", + " sys.stdout.write(\n", + " \"\\r Iteration %d: Hinge Loss = %.3f, Max. 
Constraint Violation = %.3f\"\n", + " % (iteration_count + 1, objective, max(violations)))\n", + "\n", + " # Snapshot model once in SKIP_ITERATIONS iterations.\n", + " if iteration_count % SKIP_ITERATIONS == 0:\n", + " objective_list.append(objective)\n", + " violations_list.append(violations)\n", + "\n", + " # Save snapshot of model weights.\n", + " model_constrained.save_weights(\n", + " temp_directory + \"/celeb_a_constrained_\" +\n", + " str(iteration_count / SKIP_ITERATIONS) + \".h5\")\n", + "\n", + " iteration_count += 1\n", + " if iteration_count >= NUM_ITERATIONS:\n", + " break\n", + "\n", + "# Choose best model from recorded iterates and load that model.\n", + "best_index = tfco.find_best_candidate_index(\n", + " np.array(objective_list), np.array(violations_list))\n", + "\n", + "model_constrained.load_weights(\n", + " temp_directory + \"/celeb_a_constrained_\" + str(best_index) + \".0.h5\")\n", + "\n", + "# Remove temp directory.\n", + "os.system(\"rm -r \" + temp_directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6r-6_R_gSrsT" + }, + "source": [ + "After having applied the constraint, we evaluate the results once again using Fairness Indicators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5G6B3OR9CUmo" + }, + "outputs": [], + "source": [ + "model_location = save_model(model_constrained, 'model_export_constrained')\n", + "eval_result_constrained = get_eval_results(model_location, 'eval_results_constrained')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sVteOnE80ATS" + }, + "source": [ + "As with the previous time we used Fairness Indicators, deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in.\n", + "\n", + "Note that to fairly compare the two versions of our model, it is important to use thresholds that set the overall false positive rate to be roughly equal. 
This ensures that we are looking at actual change as opposed to just a shift in the model equivalent to simply moving the threshold boundary. In our case, comparing the unconstrained model at 0.5 and the constrained model at 0.22 provides a fair comparison for the models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GRIjYftvuc7b" + }, + "outputs": [], + "source": [ + "eval_results_dict = {\n", + " 'constrained': eval_result_constrained,\n", + " 'unconstrained': eval_results_unconstrained,\n", + "}\n", + "tfma.addons.fairness.view.widget_view.render_fairness_indicator(multi_eval_results=eval_results_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lrT-7EBrcBvV" + }, + "source": [ + "With TFCO's ability to express a more complex requirement as a rate constraint, we helped this model achieve a more desirable outcome with little impact to the overall performance. There is, of course, still room for improvement, but at least TFCO was able to find a model that gets close to satisfying the constraint and reduces the disparity between the groups as much as possible." 
+ ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Fairness Indicators TFCO CelebA Case Study.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb b/docs/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb new file mode 100644 index 00000000..4487d594 --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "jMqk3Z8EciF8" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XbpNOB-vJVKu" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bqdaOVRxWs8v" + }, + "source": [ + "# Wiki Talk Comments Toxicity Prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EG_KEDkodWsT" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y6T5tlXcdW7J" + }, + "source": [ + "In this example, we consider the task of predicting whether a discussion comment posted on a Wiki talk page contains toxic content (i.e. contains content that is “rude, disrespectful or unreasonable”). We use a public dataset released by the Conversation AI project, which contains over 100k comments from the English Wikipedia that are annotated by crowd workers (see [paper](https://arxiv.org/pdf/1610.08914.pdf) for labeling methodology).\n", + "\n", + "One of the challenges with this dataset is that a very small proportion of the comments cover sensitive topics such as sexuality or religion. As such, training a neural network model on this dataset leads to disparate performance on the smaller sensitive topics. This can mean that innocuous statements about those topics might get incorrectly flagged as ‘toxic’ at higher rates, causing speech to be unfairly censored\n", + "\n", + "By imposing constraints during training, we can train a *fairer* model that performs more equitably across the different topic groups. \n", + "\n", + "We will use the TFCO library to optimize for our fairness goal during training." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DG_C2gsAKV7x" + }, + "source": [ + "## Installation\n", + "\n", + "Let's first install and import the relevant libraries. Note that you may have to restart your colab once after running the first cell because of outdated packages in the runtime. After doing so, there should be no further issues with imports." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0XOLn8Pyrc_s" + }, + "outputs": [], + "source": [ + "#@title pip installs\n", + "!pip install git+https://github.com/google-research/tensorflow_constrained_optimization\n", + "!pip install git+https://github.com/tensorflow/fairness-indicators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2ZkQDo2xcDXU" + }, + "source": [ + "Note that depending on when you run the cell below, you may receive a warning about the default version of TensorFlow in Colab switching to TensorFlow 2.X soon. You can safely ignore that warning as this notebook was designed to be compatible with TensorFlow 1.X and 2.X." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "nd_Y6CTnWs8w" + }, + "outputs": [], + "source": [ + "#@title Import Modules\n", + "import io\n", + "import os\n", + "import shutil\n", + "import sys\n", + "import tempfile\n", + "import time\n", + "import urllib\n", + "import zipfile\n", + "\n", + "import apache_beam as beam\n", + "from IPython.display import display\n", + "from IPython.display import HTML\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow.keras as keras\n", + "from tensorflow.keras import layers\n", + "from tensorflow.keras.preprocessing import sequence\n", + "from tensorflow.keras.preprocessing import text\n", + "import tensorflow_constrained_optimization as tfco\n", + "import tensorflow_model_analysis as tfma\n", + "import fairness_indicators as fi\n", + "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict as agnostic_predict" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "GvqR564dLEVa" + }, + "source": [ + "Though TFCO is compatible with eager and graph execution, this notebook assumes that eager execution is enabled by default. To ensure that nothing breaks, eager execution will be enabled in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "avMBqzjWct4Z" + }, + "outputs": [], + "source": [ + "#@title Enable Eager Execution and Print Versions\n", + "if tf.__version__ < \"2.0.0\":\n", + " tf.enable_eager_execution()\n", + " print(\"Eager execution enabled.\")\n", + "else:\n", + " print(\"Eager execution enabled by default.\")\n", + "\n", + "print(\"TensorFlow \" + tf.__version__)\n", + "print(\"TFMA \" + tfma.__version__)\n", + "print(\"FI \" + fi.version.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YUJyWaAwWs83" + }, + "source": [ + "## Hyper-parameters\n", + "\n", + "First, we set some hyper-parameters needed for the data preprocessing and model training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1aXlwlqTWs84" + }, + "outputs": [], + "source": [ + "hparams = {\n", + " \"batch_size\": 128,\n", + " \"cnn_filter_sizes\": [128, 128, 128],\n", + " \"cnn_kernel_sizes\": [5, 5, 5],\n", + " \"cnn_pooling_sizes\": [5, 5, 40],\n", + " \"constraint_learning_rate\": 0.01,\n", + " \"embedding_dim\": 100,\n", + " \"embedding_trainable\": False,\n", + " \"learning_rate\": 0.005,\n", + " \"max_num_words\": 10000,\n", + " \"max_sequence_length\": 250\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0PMs8Iwxq98C" + }, + "source": [ + "## Load and pre-process dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DIe2JRDeWs87" + }, + "source": [ + "Next, we download the dataset and preprocess it. The train, test and validation sets are provided as separate CSV files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rcd2CV7pWs88" + }, + "outputs": [], + "source": [ + "toxicity_data_url = (\"https://github.com/conversationai/unintended-ml-bias-analysis/\"\n", + " \"raw/e02b9f12b63a39235e57ba6d3d62d8139ca5572c/data/\")\n", + "\n", + "data_train = pd.read_csv(toxicity_data_url + \"wiki_train.csv\")\n", + "data_test = pd.read_csv(toxicity_data_url + \"wiki_test.csv\")\n", + "data_vali = pd.read_csv(toxicity_data_url + \"wiki_dev.csv\")\n", + "\n", + "data_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ojo617RIWs8_" + }, + "source": [ + "The `comment` column contains the discussion comments and `is_toxic` column indicates whether or not a comment is annotated as toxic. \n", + "\n", + "In the following, we:\n", + "1. Separate out the labels\n", + "2. Tokenize the text comments\n", + "3. Identify comments that contain sensitive topic terms \n", + "\n", + "First, we separate the labels from the train, test and validation sets. The labels are all binary (0 or 1)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mxo7ny90Ws9A" + }, + "outputs": [], + "source": [ + "labels_train = data_train[\"is_toxic\"].values.reshape(-1, 1) * 1.0\n", + "labels_test = data_test[\"is_toxic\"].values.reshape(-1, 1) * 1.0\n", + "labels_vali = data_vali[\"is_toxic\"].values.reshape(-1, 1) * 1.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "alrWi6jUWs9C" + }, + "source": [ + "Next, we tokenize the textual comments using the `Tokenizer` provided by `Keras`. We use the training set comments alone to build a vocabulary of tokens, and use them to convert all the comments into a (padded) sequence of tokens of the same length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yvOTBsrHWs9D" + }, + "outputs": [], + "source": [ + "tokenizer = text.Tokenizer(num_words=hparams[\"max_num_words\"])\n", + "tokenizer.fit_on_texts(data_train[\"comment\"])\n", + "\n", + "def prep_text(texts, tokenizer, max_sequence_length):\n", + " # Turns text into padded sequences.\n", + " text_sequences = tokenizer.texts_to_sequences(texts)\n", + " return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)\n", + "\n", + "text_train = prep_text(data_train[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])\n", + "text_test = prep_text(data_test[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])\n", + "text_vali = prep_text(data_vali[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Cn5zbgp-Ws9F" + }, + "source": [ + "Finally, we identify comments related to certain sensitive topic groups. We consider a subset of the identity terms provided with the dataset and group them into\n", + "four broad topic groups: *sexuality*, *gender identity*, *religion*, and *race*."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EnFfV2gEWs9G" + }, + "outputs": [], + "source": [ + "terms = {\n", + " 'sexuality': ['gay', 'lesbian', 'bisexual', 'homosexual', 'straight', 'heterosexual'], \n", + " 'gender identity': ['trans', 'transgender', 'cis', 'nonbinary'],\n", + " 'religion': ['christian', 'muslim', 'jewish', 'buddhist', 'catholic', 'protestant', 'sikh', 'taoist'],\n", + " 'race': ['african', 'african american', 'black', 'white', 'european', 'hispanic', 'latino', 'latina', \n", + " 'latinx', 'mexican', 'canadian', 'american', 'asian', 'indian', 'middle eastern', 'chinese', \n", + " 'japanese']}\n", + "\n", + "group_names = list(terms.keys())\n", + "num_groups = len(group_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ooI3F5M4Ws9I" + }, + "source": [ + "We then create separate group membership matrices for the train, test and validation sets, where the rows correspond to comments, the columns correspond to the four sensitive groups, and each entry is a boolean indicating whether the comment contains a term from the topic group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zO7PyNckWs9J" + }, + "outputs": [], + "source": [ + "def get_groups(text):\n", + " # Returns a boolean NumPy array of shape (n, k), where n is the number of comments, \n", + " # and k is the number of groups. 
Each entry (i, j) indicates if the i-th comment \n", + " # contains a term from the j-th group.\n", + " groups = np.zeros((text.shape[0], num_groups))\n", + " for ii in range(num_groups):\n", + " groups[:, ii] = text.str.contains('|'.join(terms[group_names[ii]]), case=False)\n", + " return groups\n", + "\n", + "groups_train = get_groups(data_train[\"comment\"])\n", + "groups_test = get_groups(data_test[\"comment\"])\n", + "groups_vali = get_groups(data_vali[\"comment\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GFAI6AB9Ws9L" + }, + "source": [ + "As shown below, all four topic groups constitute only a small fraction of the overall dataset, and have varying proportions of toxic comments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8Ug4u_P9Ws9M" + }, + "outputs": [], + "source": [ + "print(\"Overall label proportion = %.1f%%\" % (labels_train.mean() * 100))\n", + "\n", + "group_stats = []\n", + "for ii in range(num_groups):\n", + " group_proportion = groups_train[:, ii].mean()\n", + " group_pos_proportion = labels_train[groups_train[:, ii] == 1].mean()\n", + " group_stats.append([group_names[ii],\n", + " \"%.2f%%\" % (group_proportion * 100), \n", + " \"%.1f%%\" % (group_pos_proportion * 100)])\n", + "group_stats = pd.DataFrame(group_stats, \n", + " columns=[\"Topic group\", \"Group proportion\", \"Label proportion\"])\n", + "group_stats" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aG5ZKKrVWs9O" + }, + "source": [ + "We see that only 1.3% of the dataset contains comments related to sexuality. Among them, 37% of the comments have been annotated as being toxic. Note that this is significantly larger than the overall proportion of comments annotated as toxic. This could be because the few comments that used those identity terms did so in pejorative contexts. 
As mentioned above, this could cause our model to disproportionately misclassify comments as toxic when they include those terms. Since this is the concern, we'll make sure to look at the **False Positive Rate** when we evaluate the model's performance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5DkJpKaLWs9P" + }, + "source": [ + "## Build CNN toxicity prediction model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "niJ4KIJgWs9Q" + }, + "source": [ + "Having prepared the dataset, we now build a `Keras` model for predicting toxicity. The model we use is a convolutional neural network (CNN) with the same architecture used by the Conversation AI project for their debiasing analysis. We adapt code provided by them to construct the model layers.\n", + "\n", + "The model uses an embedding layer to convert the text tokens to fixed-length vectors. This layer converts the input text sequence into a sequence of vectors, and passes them through several layers of convolution and pooling operations, followed by a final fully-connected layer.\n", + "\n", + "We make use of pre-trained GloVe word vector embeddings, which we download below. This may take a few minutes to complete." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yevbBL2oWs9Q" + }, + "outputs": [], + "source": [ + "zip_file_url = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n", + "zip_file = urllib.request.urlopen(zip_file_url)\n", + "archive = zipfile.ZipFile(io.BytesIO(zip_file.read()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a11-YWDnWs9S" + }, + "source": [ + "We use the downloaded GloVe embeddings to create an embedding matrix, where the rows contain the word embeddings for the tokens in the `Tokenizer`'s vocabulary. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bBS74MMYWs9T" + }, + "outputs": [], + "source": [ + "embeddings_index = {}\n", + "glove_file = \"glove.6B.100d.txt\"\n", + "\n", + "with archive.open(glove_file) as f:\n", + " for line in f:\n", + " values = line.split()\n", + " word = values[0].decode(\"utf-8\") \n", + " coefs = np.asarray(values[1:], dtype=\"float32\")\n", + " embeddings_index[word] = coefs\n", + "\n", + "embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, hparams[\"embedding_dim\"]))\n", + "num_words_in_embedding = 0\n", + "for word, i in tokenizer.word_index.items():\n", + " embedding_vector = embeddings_index.get(word)\n", + " if embedding_vector is not None:\n", + " num_words_in_embedding += 1\n", + " embedding_matrix[i] = embedding_vector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t9NVp-_eWs9V" + }, + "source": [ + "We are now ready to specify the `Keras` layers. We write a function to create a new model, which we will invoke whenever we wish to train a new model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_f_DhA6OWs9W" + }, + "outputs": [], + "source": [ + "def create_model():\n", + " model = keras.Sequential()\n", + "\n", + " # Embedding layer.\n", + " embedding_layer = layers.Embedding(\n", + " embedding_matrix.shape[0],\n", + " embedding_matrix.shape[1],\n", + " weights=[embedding_matrix],\n", + " input_length=hparams[\"max_sequence_length\"],\n", + " trainable=hparams['embedding_trainable'])\n", + " model.add(embedding_layer)\n", + "\n", + " # Convolution layers.\n", + " for filter_size, kernel_size, pool_size in zip(\n", + " hparams['cnn_filter_sizes'], hparams['cnn_kernel_sizes'],\n", + " hparams['cnn_pooling_sizes']):\n", + "\n", + " conv_layer = layers.Conv1D(\n", + " filter_size, kernel_size, activation='relu', padding='same')\n", + " model.add(conv_layer)\n", + "\n", + " pooled_layer = layers.MaxPooling1D(pool_size, padding='same')\n", + " model.add(pooled_layer)\n", + "\n", + " # Add a flatten layer, a fully-connected layer and an output layer.\n", + " model.add(layers.Flatten())\n", + " model.add(layers.Dense(128, activation='relu'))\n", + " model.add(layers.Dense(1))\n", + " \n", + " return model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CwcqYITBN7bW" + }, + "source": [ + "We also define a method to set random seeds. This is done to ensure reproducible results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C_1nsXntN98C" + }, + "outputs": [], + "source": [ + "def set_seeds():\n", + " np.random.seed(121212)\n", + " tf.compat.v1.set_random_seed(212121)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X-_fKjDtWs9Y" + }, + "source": [ + "## Fairness indicators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k009haGaWs9Z" + }, + "source": [ + "We also write functions to plot fairness indicators." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B9ZgGCAs8V-I" + }, + "outputs": [], + "source": [ + "def create_examples(labels, predictions, groups, group_names):\n", + " # Returns tf.examples with given labels, predictions, and group information. \n", + " examples = []\n", + " sigmoid = lambda x: 1/(1 + np.exp(-x)) \n", + " for ii in range(labels.shape[0]):\n", + " example = tf.train.Example()\n", + " example.features.feature['toxicity'].float_list.value.append(\n", + " labels[ii][0])\n", + " example.features.feature['prediction'].float_list.value.append(\n", + " sigmoid(predictions[ii][0])) # predictions need to be in [0, 1].\n", + " for jj in range(groups.shape[1]):\n", + " example.features.feature[group_names[jj]].bytes_list.value.append(\n", + " b'Yes' if groups[ii, jj] else b'No')\n", + " examples.append(example)\n", + " return examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vESL-3dU9iiG" + }, + "outputs": [], + "source": [ + "def evaluate_results(labels, predictions, groups, group_names):\n", + " # Evaluates fairness indicators for given labels, predictions and group\n", + " # membership info.\n", + " examples = create_examples(labels, predictions, groups, group_names)\n", + "\n", + " # Create feature map for labels, predictions and each group.\n", + " feature_map = {\n", + " 'prediction': tf.io.FixedLenFeature([], tf.float32),\n", + " 'toxicity': tf.io.FixedLenFeature([], tf.float32),\n", + " }\n", + " for group in group_names:\n", + " feature_map[group] = tf.io.FixedLenFeature([], tf.string)\n", + "\n", + " # Serialize the examples.\n", + " serialized_examples = [e.SerializeToString() for e in examples]\n", + "\n", + " BASE_DIR = tempfile.gettempdir()\n", + " OUTPUT_DIR = os.path.join(BASE_DIR, 'output')\n", + "\n", + " with beam.Pipeline() as pipeline:\n", + " model_agnostic_config = agnostic_predict.ModelAgnosticConfig(\n", + " label_keys=['toxicity'],\n", + " 
prediction_keys=['prediction'],\n", + " feature_spec=feature_map)\n", + " \n", + " slices = [tfma.slicer.SingleSliceSpec()]\n", + " for group in group_names:\n", + " slices.append(\n", + " tfma.slicer.SingleSliceSpec(columns=[group]))\n", + "\n", + " extractors = [\n", + " model_agnostic_extractor.ModelAgnosticExtractor(\n", + " model_agnostic_config=model_agnostic_config),\n", + " tfma.extractors.slice_key_extractor.SliceKeyExtractor(slices)\n", + " ]\n", + "\n", + " metrics_callbacks = [\n", + " tfma.post_export_metrics.fairness_indicators(\n", + " thresholds=[0.5],\n", + " target_prediction_keys=['prediction'],\n", + " labels_key='toxicity'),\n", + " tfma.post_export_metrics.example_count()]\n", + "\n", + " # Create a model agnostic aggregator.\n", + " eval_shared_model = tfma.types.EvalSharedModel(\n", + " add_metrics_callbacks=metrics_callbacks,\n", + " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", + " add_metrics_callbacks=metrics_callbacks,\n", + " config=model_agnostic_config))\n", + "\n", + " # Run Model Agnostic Eval.\n", + " _ = (\n", + " pipeline\n", + " | beam.Create(serialized_examples)\n", + " | 'ExtractEvaluateAndWriteResults' >>\n", + " tfma.ExtractEvaluateAndWriteResults(\n", + " eval_shared_model=eval_shared_model,\n", + " output_path=OUTPUT_DIR,\n", + " extractors=extractors,\n", + " compute_confidence_intervals=True\n", + " )\n", + " )\n", + "\n", + " fairness_ind_result = tfma.load_eval_result(output_path=OUTPUT_DIR)\n", + "\n", + " # Also evaluate accuracy of the model.\n", + " accuracy = np.mean(labels == (predictions > 0.0))\n", + "\n", + " return fairness_ind_result, accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W3Sp7mpsWs9f" + }, + "outputs": [], + "source": [ + "def plot_fairness_indicators(eval_result, title):\n", + " fairness_ind_result, accuracy = eval_result\n", + " display(HTML(\"

\" + title + \n", + " \" (Accuracy = %.2f%%)\" % (accuracy * 100) + \"

\"))\n", + " widget_view.render_fairness_indicator(fairness_ind_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WqLdtgI42fxb" + }, + "outputs": [], + "source": [ + "def plot_multi_fairness_indicators(multi_eval_results):\n", + " \n", + " multi_results = {}\n", + " multi_accuracy = {}\n", + " for title, (fairness_ind_result, accuracy) in multi_eval_results.items():\n", + " multi_results[title] = fairness_ind_result\n", + " multi_accuracy[title] = accuracy\n", + " \n", + " title_str = \"

\"\n", + " for title in multi_eval_results.keys():\n", + " title_str+=title + \" (Accuracy = %.2f%%)\" % (multi_accuracy[title] * 100) + \"; \"\n", + " title_str=title_str[:-2]\n", + " title_str+=\"

\"\n", + " # fairness_ind_result, accuracy = eval_result\n", + " display(HTML(title_str))\n", + " widget_view.render_fairness_indicator(multi_eval_results=multi_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8aWNc4CdWs9h" + }, + "source": [ + "## Train unconstrained model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DuSA8qL7Ws9i" + }, + "source": [ + "For the first model we train, we optimize a simple cross-entropy loss *without* any constraints.." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0g50bauHWs9j" + }, + "outputs": [], + "source": [ + "# Set random seed for reproducible results.\n", + "set_seeds()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YsCoHMG_iIzc" + }, + "source": [ + "**Note**: The following code cell can take ~8 minutes to run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tamJiG3FiDYW" + }, + "outputs": [], + "source": [ + "# Optimizer and loss.\n", + "optimizer = tf.keras.optimizers.Adam(learning_rate=hparams[\"learning_rate\"])\n", + "loss = lambda y_true, y_pred: tf.keras.losses.binary_crossentropy(\n", + " y_true, y_pred, from_logits=True)\n", + "\n", + "# Create, compile and fit model.\n", + "model_unconstrained = create_model()\n", + "model_unconstrained.compile(optimizer=optimizer, loss=loss)\n", + "\n", + "model_unconstrained.fit(\n", + " x=text_train, y=labels_train, batch_size=hparams[\"batch_size\"], epochs=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p7AvIdktWs9t" + }, + "source": [ + "Having trained the unconstrained model, we plot various evaluation metrics for the model on the test set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tHV40_21lRL6" + }, + "outputs": [], + "source": [ + "scores_unconstrained_test = model_unconstrained.predict(text_test)\n", + "eval_result_unconstrained = evaluate_results(\n", + " labels_test, scores_unconstrained_test, groups_test, group_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AJpRuN0EOeyG" + }, + "source": [ + "As explained above, we are concentrating on the false positive rate. In their current version (0.1.2), Fairness Indicators select false negative rate by default. After running the line below, go ahead and deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2fwNpfou4yvP" + }, + "outputs": [], + "source": [ + "plot_fairness_indicators(eval_result_unconstrained, \"Unconstrained\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J3TbAenkGM7P" + }, + "source": [ + "While the overall false positive rate is less than 2%, the false positive rate on the sexuality-related comments is significantly higher. This is because the sexuality group is very small in size, and has a disproportionately higher fraction of comments annotated as toxic. Hence, training a model without constraints results in the model believing that sexuality-related terms are a strong indicator of toxicity." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KmxyAo9hWs9w" + }, + "source": [ + "## Train with constraints on false positive rates" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l3dYUchIWs9w" + }, + "source": [ + "To avoid large differences in false positive rates across different groups, we \n", + "next train a model by constraining the false positive rates for each group to be within a desired limit. 
In this case, we will optimize the error rate of the model subject to the *per-group false positive rates being less than or equal to 2%*.\n", + "\n", + "Training on minibatches with per-group constraints can be challenging for this dataset, however, as the groups we wish to constrain are all small in size, and it's likely that the individual minibatches contain very few examples from each group. Hence the gradients we compute during training will be noisy, and result in the model converging very slowly. \n", + "\n", + "To mitigate this problem, we recommend using two streams of minibatches, with the first stream formed as before from the entire training set, and the second stream formed solely from the sensitive group examples. We will compute the objective using minibatches from the first stream and the per-group constraints using minibatches from the second stream. Because the batches from the second stream are likely to contain a larger number of examples from each group, we expect our updates to be less noisy.\n", + "\n", + "We create separate features, labels and groups tensors to hold the minibatches from the two streams."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vMuuTOEOWs9x" + }, + "outputs": [], + "source": [ + "# Set random seed.\n", + "set_seeds()\n", + "\n", + "# Features tensors.\n", + "batch_shape = (hparams[\"batch_size\"], hparams['max_sequence_length'])\n", + "features_tensor = tf.Variable(np.zeros(batch_shape, dtype='int32'), name='x')\n", + "features_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='int32'), name='x_sen')\n", + "\n", + "# Labels tensors.\n", + "batch_shape = (hparams[\"batch_size\"], 1)\n", + "labels_tensor = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='labels')\n", + "labels_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='labels_sen')\n", + "\n", + "# Groups tensors.\n", + "batch_shape = (hparams[\"batch_size\"], num_groups)\n", + "groups_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='groups_sen')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-wh26V7nWs9z" + }, + "source": [ + "We instantiate a new model, and compute predictions for minibatches from the two streams." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kawyrkQIWs9z" + }, + "outputs": [], + "source": [ + "# Create model, and separate prediction functions for the two streams. \n", + "# For the predictions, we use a nullary function returning a Tensor to support eager mode.\n", + "model_constrained = create_model()\n", + "\n", + "def predictions():\n", + " return model_constrained(features_tensor)\n", + "\n", + "def predictions_sen():\n", + " return model_constrained(features_tensor_sen)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UG9t7dw1Ws91" + }, + "source": [ + "We then set up a constrained optimization problem with the error rate as the objective and with constraints on the per-group false positive rate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EhKAMGSJWs93" + }, + "outputs": [], + "source": [ + "epsilon = 0.02 # Desired false-positive rate threshold.\n", + "\n", + "# Set up separate contexts for the two minibatch streams.\n", + "context = tfco.rate_context(predictions, lambda:labels_tensor)\n", + "context_sen = tfco.rate_context(predictions_sen, lambda:labels_tensor_sen)\n", + "\n", + "# Compute the objective using the first stream.\n", + "objective = tfco.error_rate(context)\n", + "\n", + "# Compute the constraint using the second stream.\n", + "# Subset the examples belonging to the \"sexuality\" group from the second stream \n", + "# and add a constraint on the group's false positive rate.\n", + "context_sen_subset = context_sen.subset(lambda: groups_tensor_sen[:, 0] > 0)\n", + "constraint = [tfco.false_positive_rate(context_sen_subset) <= epsilon]\n", + "\n", + "# Create a rate minimization problem.\n", + "problem = tfco.RateMinimizationProblem(objective, constraint)\n", + "\n", + "# Set up a constrained optimizer.\n", + "optimizer = tfco.ProxyLagrangianOptimizerV2(\n", + " optimizer=tf.keras.optimizers.Adam(learning_rate=hparams[\"learning_rate\"]),\n", + " num_constraints=problem.num_constraints)\n", + "\n", + "# List of variables to optimize include the model weights, \n", + "# and the trainable variables from the rate minimization problem and \n", + "# the constrained optimizer.\n", + "var_list = (model_constrained.trainable_weights + list(problem.trainable_variables) +\n", + " optimizer.trainable_variables())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CoFWd8wMWs94" + }, + "source": [ + "We are ready to train the model. We maintain a separate counter for the two minibatch streams. 
Every time we perform a gradient update, we will have to copy the minibatch contents from the first stream to the tensors `features_tensor` and `labels_tensor`, and the minibatch contents from the second stream to the tensors `features_tensor_sen`, `labels_tensor_sen` and `groups_tensor_sen`.\n", + "\n", + "**Note**: The following code cell may take ~12 minutes to run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zbXohC6vWs95" + }, + "outputs": [], + "source": [ + "# Indices of sensitive group members.\n", + "protected_group_indices = np.nonzero(groups_train.sum(axis=1))[0]\n", + "\n", + "num_examples = text_train.shape[0]\n", + "num_examples_sen = protected_group_indices.shape[0]\n", + "batch_size = hparams[\"batch_size\"]\n", + "\n", + "# Number of steps needed for one epoch over the training sample.\n", + "num_steps = int(num_examples / batch_size)\n", + "\n", + "start_time = time.time()\n", + "\n", + "# Loop over minibatches.\n", + "for batch_index in range(num_steps):\n", + " # Indices for current minibatch in the first stream.\n", + " batch_indices = np.arange(\n", + " batch_index * batch_size, (batch_index + 1) * batch_size)\n", + " batch_indices = [ind % num_examples for ind in batch_indices]\n", + "\n", + " # Indices for current minibatch in the second stream.\n", + " batch_indices_sen = np.arange(\n", + " batch_index * batch_size, (batch_index + 1) * batch_size)\n", + " batch_indices_sen = [protected_group_indices[ind % num_examples_sen]\n", + " for ind in batch_indices_sen]\n", + "\n", + " # Assign features, labels, groups from the minibatches to the respective tensors.\n", + " features_tensor.assign(text_train[batch_indices, :])\n", + " labels_tensor.assign(labels_train[batch_indices])\n", + "\n", + " features_tensor_sen.assign(text_train[batch_indices_sen, :])\n", + " labels_tensor_sen.assign(labels_train[batch_indices_sen])\n", + " groups_tensor_sen.assign(groups_train[batch_indices_sen, :])\n", + "\n", + " 
# Gradient update.\n", + " optimizer.minimize(problem, var_list=var_list)\n", + " \n", + " # Record and print batch training stats every 10 steps.\n", + " if (batch_index + 1) % 10 == 0 or batch_index in (0, num_steps - 1):\n", + " hinge_loss = problem.objective()\n", + " max_violation = max(problem.constraints())\n", + "\n", + " elapsed_time = time.time() - start_time\n", + " sys.stdout.write(\n", + " \"\\rStep %d / %d: Elapsed time = %ds, Loss = %.3f, Violation = %.3f\" % \n", + " (batch_index + 1, num_steps, elapsed_time, hinge_loss, max_violation))\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DdJfplDpWs97" + }, + "source": [ + "Having trained the constrained model, we plot various evaluation metrics for the model on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jEerPEwLhfTN" + }, + "outputs": [], + "source": [ + "scores_constrained_test = model_constrained.predict(text_test)\n", + "eval_result_constrained = evaluate_results(\n", + " labels_test, scores_constrained_test, groups_test, group_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ustp5z7xQnHI" + }, + "source": [ + "As with last time, remember to select false_positive_rate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ztK7iM4LjKmT" + }, + "outputs": [], + "source": [ + "plot_fairness_indicators(eval_result_constrained, \"Constrained\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6P6dxSg5_mTu" + }, + "outputs": [], + "source": [ + "multi_results = {\n", + " 'constrained':eval_result_constrained,\n", + " 'unconstrained':eval_result_unconstrained,\n", + "}\n", + "plot_multi_fairness_indicators(multi_eval_results=multi_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EfKo5O3QWs9-" + }, + "source": [ + "As we can see from the Fairness Indicators, compared to the unconstrained model the constrained model yields significantly lower false positive rates for the sexuality-related comments, and does so with only a slight dip in the overall accuracy." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Fairness Indicators TFCO Wiki Comments Case Study.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb b/docs/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb new file mode 100644 index 00000000..4033ca49 --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb @@ -0,0 +1,410 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_E4uORykIpG4" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "aBT221yVIujn" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aalPefrUUplk" + }, + "source": [ + "# Fairness Indicators TensorBoard Plugin Example Colab" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fFTJpyFlI-uI" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZ48WFLwbCL6" + }, + "source": [ + "##Overview\n", + "\n", + "In this activity, you'll use [Fairness Indicators for TensorBoard](https://github.com/tensorflow/tensorboard/tree/master/docs/fairness-indicators.md). With the plugin, you can visualize fairness evaluations for your runs and easily compare performance across groups.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u33JXdluZ2lG" + }, + "source": [ + "# Importing\n", + "\n", + "Run the following code to install the required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EoRNffG599XP" + }, + "outputs": [], + "source": [ + "!pip install -q -U pip==20.2\n", + "\n", + "!pip install fairness_indicators 'absl-py<0.9,>=0.7'\n", + "!pip install google-api-python-client==1.8.3\n", + "!pip install tensorboard-plugin-fairness-indicators\n", + "!pip install tensorflow-serving-api==2.17.1\n", + "!pip install tensorflow-model-analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mglfaM4_mtIk" + }, + "source": [ + "**Restart the runtime.** After the runtime is restarted, continue with following cells without running previous cell again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sFZJ8f_M7mlc" + }, + "outputs": [], + "source": [ + "# %tf.disable_v2_behavior()\t# Uncomment this line if running in Google Colab." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B8dlyTyiTe-9" + }, + "outputs": [], + "source": [ + "import datetime\n", + "import os\n", + "import tempfile\n", + "from tensorboard_plugin_fairness_indicators import summary_v2\n", + "import tensorflow.compat.v1 as tf\n", + "import numpy as np\n", + "from tensorflow import keras\n", + "from google.protobuf import text_format\n", + "\n", + "# example_model.py is provided in fairness_indicators package to train and\n", + "# evaluate an example model.\n", + "from fairness_indicators import example_model\n", + "import tensorflow_model_analysis as tfma\n", + "\n", + "tf.compat.v1.enable_eager_execution()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TsplOJGqWCf5" + }, + "source": [ + "# Data and Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NdLBi6tN5i7I" + }, + "outputs": [], + "source": [ + "# To know about dataset, check Fairness Indicators Example Colab at:\n", + "# https://github.com/tensorflow/fairness-indicators/blob/master/docs/tutorials/Fairness_Indicators_Example_Colab.ipynb\n", + "\n", + "train_tf_file = tf.keras.utils.get_file('train.tf', 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", + "validate_tf_file = tf.keras.utils.get_file('validate.tf', 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')\n", + "\n", + "BASE_DIR = tempfile.gettempdir()\n", + "TEXT_FEATURE = 'comment_text'\n", + "LABEL = 'toxicity'\n", + "FEATURE_MAP = {\n", + " # Label:\n", + " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", + " # Text:\n", + " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", + "\n", + " # Identities:\n", + " 'sexual_orientation': tf.io.VarLenFeature(tf.string),\n", + " 'gender': tf.io.VarLenFeature(tf.string),\n", + " 'religion': tf.io.VarLenFeature(tf.string),\n", + " 'race': tf.io.VarLenFeature(tf.string),\n", + " 
'disability': tf.io.VarLenFeature(tf.string),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mfbgerCsEOmN" + }, + "source": [ + "# Train the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YwoC-dzEDid3" + }, + "outputs": [], + "source": [ + "model_dir = os.path.join(BASE_DIR, 'train',\n", + " datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VqjEYySbYaX5" + }, + "outputs": [], + "source": [ + "classifier = example_model.get_example_model(example_model.TEXT_FEATURE)\n", + "classifier.compile(optimizer=keras.optimizers.Adam(), loss='mse')\n", + "\n", + "# Read the data from the training file\n", + "data = []\n", + "dataset = tf.data.Dataset.list_files(train_tf_file, shuffle=False)\n", + "dataset = dataset.flat_map(tf.data.TFRecordDataset)\n", + "for raw_record in dataset.take(1):\n", + " example = tf.train.Example()\n", + " example.ParseFromString(raw_record.numpy())\n", + " data.append(example)\n", + "\n", + "classifier.fit(\n", + " tf.constant([e.SerializeToString() for e in data]),\n", + " np.array([\n", + " e.features.feature[example_model.LABEL].float_list.value[:][0]\n", + " for e in data\n", + " ]),\n", + ")\n", + "classifier.save(model_dir, save_format='tf')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTPqije9Eg5b" + }, + "source": [ + "# Run TensorFlow Model Analysis with Fairness Indicators\n", + "This step might take 2 to 5 minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QLjiy5VCzlRw" + }, + "outputs": [], + "source": [ + "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", + "\n", + "eval_config = text_format.Parse(\n", + " \"\"\"\n", + " model_specs {\n", + " signature_name: \"serving_default\"\n", + " prediction_key: \"predictions\" # placeholder\n", + " label_key: \"toxicity\" # placeholder\n", + " }\n", + " slicing_specs {}\n", + " slicing_specs {\n", + " feature_keys: [\"gender\"]\n", + " }\n", + " metrics_specs {\n", + " metrics {\n", + " class_name: \"ExampleCount\"\n", + " }\n", + " metrics {\n", + " class_name: \"FairnessIndicators\"\n", + " }\n", + " }\n", + "\"\"\",\n", + " tfma.EvalConfig(),\n", + ")\n", + "\n", + "tfma_eval_result_path = os.path.join(model_dir, 'tfma_eval_result')\n", + "example_model.evaluate_model(\n", + " model_dir,\n", + " validate_tf_file,\n", + " tfma_eval_result_path,\n", + " eval_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U1ROnulYc8Ub" + }, + "source": [ + "# Visualize Fairness Indicators in TensorBoard\n", + "\n", + "\n", + "Below you will visualize Fairness Indicators in Tensorboard and compare performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the drop down menus at the top of the visualization. You can also select different evaluation runs using the drop down menu at the top-left corner." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zCV-Jo0xda6g" + }, + "source": [ + "## Write Fairness Indicators Summary\n", + "Write summary file containing all required information to visualize Fairness Indicators in TensorBoard." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JNaNhTCTAMHm" + }, + "outputs": [], + "source": [ + "import tensorflow.compat.v2 as tf2\n", + "\n", + "writer = tf2.summary.create_file_writer(\n", + " os.path.join(model_dir, 'fairness_indicators'))\n", + "with writer.as_default():\n", + " summary_v2.FairnessIndicators(tfma_eval_result_path, step=1)\n", + "writer.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MB2Gfm9BdXVY" + }, + "source": [ + "## Launch TensorBoard\n", + "Navigate to \"Fairness Indicators\" tab to visualize Fairness Indicators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UiHhDWu8tyEI" + }, + "outputs": [], + "source": [ + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ix6d718udWsK" + }, + "outputs": [], + "source": [ + "%tensorboard --logdir=$model_dir" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb b/docs/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb new file mode 100644 index 00000000..a2b9cc66 --- /dev/null +++ b/docs/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb @@ -0,0 +1,539 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Tce3stUlHN0L" + }, + "source": [ + "##### Copyright 2020 The TensorFlow Authors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "tuOe1ymfHZPu" + }, + "outputs": [], + "source": [ + "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aalPefrUUplk" + }, + "source": [ + "# Fairness Indicators on TF-Hub Text Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MfBg1C5NB3X0" + }, + "source": [ + "
\n", + " \n", + "
\n", + " \n", + " View on TensorFlow.org\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Run in Google Colab\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " View source on GitHub\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " Download notebook\n", + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w0zsksbydmNp" + }, + "source": [ + "In this tutorial, you will learn how to use [Fairness Indicators](https://github.com/tensorflow/fairness-indicators) to evaluate embeddings from [TF Hub](https://www.tensorflow.org/hub). This notebook uses the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u33JXdluZ2lG" + }, + "source": [ + "## Setup\n", + "\n", + "Install the required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BAUEkqYlzP3W" + }, + "outputs": [], + "source": [ + "!pip install -q -U pip==20.2\n", + "\n", + "!pip install fairness-indicators \\\n", + " \"absl-py==0.12.0\" \\\n", + " \"pyarrow==10.0.1\" \\\n", + " \"apache-beam==2.50.0\" \\\n", + " \"avro-python3==1.9.1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e6pe8c6L7kCW" + }, + "source": [ + "Import other required libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B8dlyTyiTe-9" + }, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "import apache_beam as beam\n", + "from datetime import datetime\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "import tensorflow_model_analysis as tfma\n", + "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", + "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", + "from fairness_indicators import example_model\n", + "from fairness_indicators.tutorial_utils import util" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xz4PcI0hSVcq" + }, + "source": [ + "### Dataset\n", + "\n", + "In this notebook, you work with the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification) which contains approximately 2 million public comments made public by the [Civil Comments platform](https://github.com/reaktivstudios/civil-comments) in 2017 for ongoing research. This effort was sponsored by Jigsaw, who have hosted competitions on Kaggle to help classify toxic comments as well as minimize unintended model bias.\n", + "\n", + "Each individual text comment in the dataset has a toxicity label, with the label being 1 if the comment is toxic and 0 if the comment is non-toxic. Within the data, a subset of comments are labeled with a variety of identity attributes, including categories for gender, sexual orientation, religion, and race or ethnicity." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ekzb7vVnPCc" + }, + "source": [ + "### Prepare the data\n", + "\n", + "TensorFlow parses features from data using [`tf.io.FixedLenFeature`](https://www.tensorflow.org/api_docs/python/tf/io/FixedLenFeature) and [`tf.io.VarLenFeature`](https://www.tensorflow.org/api_docs/python/tf/io/VarLenFeature). 
Map out the input feature, output feature, and all other slicing features of interest." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n4_nXQDykX6W" + }, + "outputs": [], + "source": [ + "BASE_DIR = tempfile.gettempdir()\n", + "\n", + "# The input and output features of the classifier\n", + "TEXT_FEATURE = 'comment_text'\n", + "LABEL = 'toxicity'\n", + "\n", + "FEATURE_MAP = {\n", + " # input and output features\n", + " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", + " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", + "\n", + " # slicing features\n", + " 'sexual_orientation': tf.io.VarLenFeature(tf.string),\n", + " 'gender': tf.io.VarLenFeature(tf.string),\n", + " 'religion': tf.io.VarLenFeature(tf.string),\n", + " 'race': tf.io.VarLenFeature(tf.string),\n", + " 'disability': tf.io.VarLenFeature(tf.string)\n", + "}\n", + "\n", + "IDENTITY_TERMS = ['gender', 'sexual_orientation', 'race', 'religion', 'disability']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CeUtnaT49Doq" + }, + "source": [ + "By default, the notebook downloads a preprocessed version of this dataset, but\n", + "you may use the original dataset and re-run the processing steps if\n", + "desired.\n", + "\n", + "In the original dataset, each comment is labeled with the percentage\n", + "of raters who believed that a comment corresponds to a particular\n", + "identity. For example, a comment might be labeled with the following:\n", + "`{ male: 0.3, female: 1.0, transgender: 0.0, heterosexual: 0.8,\n", + "homosexual_gay_or_lesbian: 1.0 }`.\n", + "\n", + "The processing step groups identity by category (gender,\n", + "sexual_orientation, etc.) and removes identities with a score less\n", + "than 0.5. 
For example, the comment above would be labeled with the\n", + "following:\n", + "`{ gender: [female], sexual_orientation: [heterosexual,\n", + "homosexual_gay_or_lesbian] }`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHxa31VX9eP2" + }, + "source": [ + "Download the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NUmSmqYGS0n8" + }, + "outputs": [], + "source": [ + "download_original_data = False #@param {type:\"boolean\"}\n", + "\n", + "if download_original_data:\n", + " train_tf_file = tf.keras.utils.get_file('train_tf.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/train_tf.tfrecord')\n", + " validate_tf_file = tf.keras.utils.get_file('validate_tf.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf.tfrecord')\n", + "\n", + " # The identity terms list will be grouped together by their categories\n", + " # (see 'IDENTITY_COLUMNS') on threshold 0.5. Only the identity term column,\n", + " # text column and label column will be kept after processing.\n", + " train_tf_file = util.convert_comments_data(train_tf_file)\n", + " validate_tf_file = util.convert_comments_data(validate_tf_file)\n", + "\n", + "else:\n", + " train_tf_file = tf.keras.utils.get_file('train_tf_processed.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", + " validate_tf_file = tf.keras.utils.get_file('validate_tf_processed.tfrecord',\n", + " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zz1NLR5Uu3oQ" + }, + "source": [ + "## Create a TensorFlow Model Analysis Pipeline\n", + "\n", + "The Fairness Indicators library operates on [TensorFlow Model Analysis (TFMA) models](https://tensorflow.github.io/model-analysis/get_started). 
TFMA models wrap TensorFlow models with additional functionality to evaluate and visualize their results. The actual evaluation occurs inside of an [Apache Beam pipeline](https://beam.apache.org/documentation/programming-guide/).\n", + "\n", + "The steps you follow to create a TFMA pipeline are:\n", + "1. Build a TensorFlow model\n", + "2. Build a TFMA model on top of the TensorFlow model\n", + "3. Run the model analysis in an orchestrator. The example model in this notebook uses Apache Beam as the orchestrator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7nSvu4IUCigW" + }, + "outputs": [], + "source": [ + "def embedding_fairness_result(embedding, identity_term='gender'):\n", + " \n", + " model_dir = os.path.join(BASE_DIR, 'train',\n", + " datetime.now().strftime('%Y%m%d-%H%M%S'))\n", + "\n", + " print(\"Training classifier for \" + embedding)\n", + " classifier = example_model.train_model(model_dir,\n", + " train_tf_file,\n", + " LABEL,\n", + " TEXT_FEATURE,\n", + " FEATURE_MAP,\n", + " embedding)\n", + "\n", + " # Create a unique path to store the results for this embedding.\n", + " embedding_name = embedding.split('/')[-2]\n", + " eval_result_path = os.path.join(BASE_DIR, 'eval_result', embedding_name)\n", + "\n", + " example_model.evaluate_model(classifier,\n", + " validate_tf_file,\n", + " eval_result_path,\n", + " identity_term,\n", + " LABEL,\n", + " FEATURE_MAP)\n", + " return tfma.load_eval_result(output_path=eval_result_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTPqije9Eg5b" + }, + "source": [ + "## Run TFMA & Fairness Indicators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8AvInTNt8Gyn" + }, + "source": [ + "### Fairness Indicators Metrics\n", + "\n", + "Some of the metrics available with Fairness Indicators are:\n", + "\n", + "* [Negative Rate, False Negative Rate (FNR), and True Negative Rate 
(TNR)](https://en.wikipedia.org/wiki/False_positives_and_false_negatives#False_positive_and_false_negative_rates)\n", + "* [Positive Rate, False Positive Rate (FPR), and True Positive Rate (TPR)](https://en.wikipedia.org/wiki/False_positives_and_false_negatives#False_positive_and_false_negative_rates)\n", + "* [Accuracy](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Accuracy)\n", + "* [Precision and Recall](https://en.wikipedia.org/wiki/Precision_and_recall)\n", + "* [Precision-Recall AUC](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/AUC)\n", + "* [ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LGXCFtScblYt" + }, + "source": [ + "### Text Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1CI-1M5qXGjG" + }, + "source": [ + "**[TF-Hub](https://www.tensorflow.org/hub)** provides several **text embeddings**. These embeddings will serve as the feature column for the different models. This tutorial uses the following embeddings:\n", + "\n", + "* [**random-nnlm-en-dim128**](https://tfhub.dev/google/random-nnlm-en-dim128/1): random text embeddings, this serves as a convenient baseline.\n", + "* [**nnlm-en-dim128**](https://tfhub.dev/google/nnlm-en-dim128/1): a text embedding based on [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf). \n", + "* [**universal-sentence-encoder**](https://tfhub.dev/google/universal-sentence-encoder/2): a text embedding based on [Universal Sentence Encoder](https://arxiv.org/pdf/1803.11175.pdf)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xxq97Qt7itVL" + }, + "source": [ + "## Fairness Indicator Results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "27FX15awixuK" + }, + "source": [ + "Compute fairness indicators with the `embedding_fairness_result` pipeline, and then render the results in the Fairness Indicator UI widget with `widget_view.render_fairness_indicator` for all the above embeddings.\n", + "\n", + "Note: You may need to run the `widget_view.render_fairness_indicator` cells twice for the visualization to be displayed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yEUbZ93y8NCW" + }, + "source": [ + "#### Random NNLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DkSuox-Pb6Pz" + }, + "outputs": [], + "source": [ + "eval_result_random_nnlm = embedding_fairness_result('https://tfhub.dev/google/random-nnlm-en-dim128/1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "05xUesz6VpAe" + }, + "outputs": [], + "source": [ + "widget_view.render_fairness_indicator(eval_result=eval_result_random_nnlm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jmKe8Z1b8SBy" + }, + "source": [ + "#### NNLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5b8HcTUBckj1" + }, + "outputs": [], + "source": [ + "eval_result_nnlm = embedding_fairness_result('https://tfhub.dev/google/nnlm-en-dim128/1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n6hasLzFVrDN" + }, + "outputs": [], + "source": [ + "widget_view.render_fairness_indicator(eval_result=eval_result_nnlm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1I4xEDNq8T0X" + }, + "source": [ + "#### Universal Sentence Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GrdweWRkck8A" + }, + "outputs": [], + "source": [ + 
"eval_result_use = embedding_fairness_result('https://tfhub.dev/google/universal-sentence-encoder/2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JBABAkZMVtTK" + }, + "outputs": [], + "source": [ + "widget_view.render_fairness_indicator(eval_result=eval_result_use)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "402oTKbap77R" + }, + "source": [ + "### Comparing Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UgnqwNjpqBuv" + }, + "source": [ + "You can also use Fairness Indicators to compare embeddings directly. For example, compare the models generated from the NNLM and USE embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "49ECfYWUp7Kk" + }, + "outputs": [], + "source": [ + "widget_view.render_fairness_indicator(multi_eval_results={'nnlm': eval_result_nnlm, 'use': eval_result_use})" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Fairness Indicators on TF-Hub Text Embeddings", + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/g3doc/tutorials/README.md b/docs/tutorials/README.md similarity index 100% rename from g3doc/tutorials/README.md rename to docs/tutorials/README.md diff --git a/g3doc/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb b/docs/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb similarity index 99% rename from g3doc/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb rename to 
docs/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb index 53b6188c..8756ce3d 100644 --- a/g3doc/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb +++ b/docs/tutorials/_Deprecated_Fairness_Indicators_Lineage_Case_Study.ipynb @@ -53,7 +53,7 @@ " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Lineage_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", " \u003c/td\u003e\n", " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/g3doc/tutorials/Fairness_Indicators_Lineage_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", + " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/docs/tutorials/Fairness_Indicators_Lineage_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", " \u003ctd\u003e\n", " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_Lineage_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", @@ -97,7 +97,7 @@ "\n", "* **[TensorFlow Model Analysis](https://www.tensorflow.org/tfx/tutorials/model_analysis/tfma_basic)** is a library for evaluating machine learning models. 
Users can evaluate their models on a large amount of data in a distributed manner and view metrics over different slices within a notebook.\n", "\n", - "* **[Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators)** is a suite of tools built on top of TensorFlow Model Analysis that enables regular evaluation of fairness metrics in product pipelines.\n", + "* **[Fairness Indicators](https://tensorflow.github.io/fairness-indicators)** is a suite of tools built on top of TensorFlow Model Analysis that enables regular evaluation of fairness metrics in product pipelines.\n", "\n", "* **[ML Metadata](https://www.tensorflow.org/tfx/guide/mlmd)** is a library for recording and retrieving the lineage and metadata of ML artifacts such as models, datasets, and metrics. Within TFX ML Metadata will help us understand the artifacts created in a pipeline, which is a unit of data that is passed between TFX components.\n", "\n", @@ -121,7 +121,7 @@ "## Helpful Resources\n", "This case study is an extension of the below case studies. It is recommended working through the below case studies first. 
\n", "* [TFX Pipeline Overview](https://github.com/tensorflow/workshops/blob/master/tfx_labs/Lab_1_Pipeline_in_Colab.ipynb)\n", - "* [Fairness Indicator Case Study](https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb)\n", + "* [Fairness Indicator Case Study](https://github.com/tensorflow/fairness-indicators/blob/master/docs/tutorials/Fairness_Indicators_Example_Colab.ipynb)\n", "* [TFX Data Validation](https://github.com/tensorflow/tfx/blob/master/tfx/examples/airflow_workshop/notebooks/step3.ipynb)\n", "\n", "\n", diff --git a/g3doc/tutorials/_toc.yaml b/docs/tutorials/_toc.yaml similarity index 100% rename from g3doc/tutorials/_toc.yaml rename to docs/tutorials/_toc.yaml diff --git a/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb b/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb deleted file mode 100644 index 71dcdffe..00000000 --- a/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb +++ /dev/null @@ -1,409 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Sxt-9qpNgPxo" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Phnw6c3-gQ1f" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aalPefrUUplk" - }, - "source": [ - "# FaceSSD Fairness Indicators Example Colab" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KFRBcGOYgEAI" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Facessd_Fairness_Indicators_Example_Colab\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UZ48WFLwbCL6" - }, - "source": [ - "##Overview\n", - "\n", - "In this activity, you'll use [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators) to explore the [FaceSSD predictions on Labeled Faces in the Wild 
dataset](https://modelcards.withgoogle.com/face-detection). Fairness Indicators is a suite of tools built on top of [TensorFlow Model Analysis](https://www.tensorflow.org/tfx/model_analysis/get_started) that enable regular evaluation of fairness metrics in product pipelines.\n", - "\n", - "##About the Dataset\n", - "\n", - "In this exercise, you'll work with the FaceSSD prediction dataset, approximately 200k different image predictions and groundtruths generated by FaceSSD API.\n", - "\n", - "##About the Tools\n", - "\n", - "[TensorFlow Model Analysis](https://www.tensorflow.org/tfx/model_analysis/get_started) is a library for evaluating both TensorFlow and non-TensorFlow machine learning models. It allows users to evaluate their models on large amounts of data in a distributed manner, computing in-graph and other metrics over different slices of data and visualize in notebooks.\n", - "\n", - "[TensorFlow Data Validation](https://www.tensorflow.org/tfx/data_validation/get_started) is one tool you can use to analyze your data. You can use it to find potential problems in your data, such as missing values and data imbalances, that can lead to Fairness disparities.\n", - "\n", - "With [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators), users will be able to: \n", - "\n", - "* Evaluate model performance, sliced across defined groups of users\n", - "* Feel confident about results with confidence intervals and evaluations at multiple thresholds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u33JXdluZ2lG" - }, - "source": [ - "# Importing\n", - "\n", - "Run the following code to install the fairness_indicators library. This package contains the tools we'll be using in this exercise. Restart Runtime may be requested but is not necessary." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EoRNffG599XP" - }, - "outputs": [], - "source": [ - "!pip install apache_beam\n", - "!pip install fairness-indicators\n", - "!pip install witwidget\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B8dlyTyiTe-9" - }, - "outputs": [], - "source": [ - "import os\n", - "import tempfile\n", - "import apache_beam as beam\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime\n", - "\n", - "import tensorflow_hub as hub\n", - "import tensorflow as tf\n", - "import tensorflow_model_analysis as tfma\n", - "import tensorflow_data_validation as tfdv\n", - "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", - "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict as agnostic_predict\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", - "\n", - "from witwidget.notebook.visualization import WitConfigBuilder\n", - "from witwidget.notebook.visualization import WitWidget" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TsplOJGqWCf5" - }, - "source": [ - "# Download and Understand the Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vFOQ4AaIcAn2" - }, - "source": [ - "[Labeled Faces in the Wild](http://vis-www.cs.umass.edu/lfw/) is a public benchmark dataset for face verification, also known as pair matching. LFW contains more than 13,000 images of faces collected from the web.\n", - "\n", - "We ran FaceSSD predictions on this dataset to predict whether a face is present in a given image. 
In this Colab, we will slice data according to gender to observe if there are any significant differences between model performance for different gender groups.\n", - "\n", - "If there is more than one face in an image, gender is labeled as \"MISSING\".\n", - "\n", - "We've hosted the dataset on Google Cloud Platform for convenience. Run the following code to download the data from GCP, the data will take about a minute to download and analyze." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NdLBi6tN5i7I" - }, - "outputs": [], - "source": [ - "data_location = tf.keras.utils.get_file('lfw_dataset.tf', 'https://storage.googleapis.com/facessd_dataset/lfw_dataset.tfrecord')\n", - "\n", - "stats = tfdv.generate_statistics_from_tfrecord(data_location=data_location)\n", - "tfdv.visualize_statistics(stats)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cNODEwE5x7Uo" - }, - "source": [ - "# Defining Constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZF4NO87uFxdQ" - }, - "outputs": [], - "source": [ - "BASE_DIR = tempfile.gettempdir()\n", - "\n", - "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", - "\n", - "compute_confidence_intervals = True\n", - "\n", - "slice_key = 'object/groundtruth/Gender'\n", - "label_key = 'object/groundtruth/face'\n", - "prediction_key = 'object/prediction/face'\n", - "\n", - "feature_map = {\n", - " slice_key:\n", - " tf.io.FixedLenFeature([], tf.string, default_value=['none']),\n", - " label_key:\n", - " tf.io.FixedLenFeature([], tf.float32, default_value=[0.0]),\n", - " prediction_key:\n", - " tf.io.FixedLenFeature([], tf.float32, default_value=[0.0]),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gVLHwuhEyI8R" - }, - "source": [ - "# Model Agnostic Config for TFMA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ej1nGCZSyJIK" - }, - "outputs": [], 
- "source": [ - "model_agnostic_config = agnostic_predict.ModelAgnosticConfig(\n", - " label_keys=[label_key],\n", - " prediction_keys=[prediction_key],\n", - " feature_spec=feature_map)\n", - "\n", - "model_agnostic_extractors = [\n", - " model_agnostic_extractor.ModelAgnosticExtractor(\n", - " model_agnostic_config=model_agnostic_config, desired_batch_size=3),\n", - " tfma.extractors.slice_key_extractor.SliceKeyExtractor(\n", - " [tfma.slicer.SingleSliceSpec(),\n", - " tfma.slicer.SingleSliceSpec(columns=[slice_key])])\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wqkk9SkvyVkR" - }, - "source": [ - "# Fairness Callbacks and Computing Fairness Metrics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "A0icrlliBCOb" - }, - "outputs": [], - "source": [ - "# Helper class for counting examples in beam PCollection\n", - "class CountExamples(beam.CombineFn):\n", - " def __init__(self, message):\n", - " self.message = message\n", - "\n", - " def create_accumulator(self):\n", - " return 0\n", - "\n", - " def add_input(self, current_sum, element):\n", - " return current_sum + 1\n", - "\n", - " def merge_accumulators(self, accumulators): \n", - " return sum(accumulators)\n", - "\n", - " def extract_output(self, final_sum):\n", - " if final_sum:\n", - " print(\"%s: %d\"%(self.message, final_sum))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mRQjdjp9yVv2" - }, - "outputs": [], - "source": [ - "metrics_callbacks = [\n", - " tfma.post_export_metrics.fairness_indicators(\n", - " thresholds=[0.1, 0.3, 0.5, 0.7, 0.9],\n", - " labels_key=label_key,\n", - " target_prediction_keys=[prediction_key]),\n", - " tfma.post_export_metrics.auc(\n", - " curve='PR',\n", - " labels_key=label_key,\n", - " target_prediction_keys=[prediction_key]),\n", - "]\n", - "\n", - "eval_shared_model = tfma.types.EvalSharedModel(\n", - " add_metrics_callbacks=metrics_callbacks,\n", - " 
construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", - " add_metrics_callbacks=metrics_callbacks,\n", - " config=model_agnostic_config))\n", - "\n", - "with beam.Pipeline() as pipeline:\n", - " # Read data.\n", - " data = (\n", - " pipeline\n", - " | 'ReadData' \u003e\u003e beam.io.ReadFromTFRecord(data_location))\n", - "\n", - " # Count all examples.\n", - " data_count = (\n", - " data | 'Count number of examples' \u003e\u003e beam.CombineGlobally(\n", - " CountExamples('Before filtering \"Gender:MISSING\"')))\n", - "\n", - " # If there are more than one face in image, the gender feature is 'MISSING'\n", - " # and we are filtering that image out.\n", - " def filter_missing_gender(element):\n", - " example = tf.train.Example.FromString(element)\n", - " if example.features.feature[slice_key].bytes_list.value[0] != b'MISSING':\n", - " yield element\n", - "\n", - " filtered_data = (\n", - " data\n", - " | 'Filter Missing Gender' \u003e\u003e beam.ParDo(filter_missing_gender))\n", - "\n", - " # Count after filtering \"Gender:MISSING\".\n", - " filtered_data_count = (\n", - " filtered_data | 'Count number of examples after filtering'\n", - " \u003e\u003e beam.CombineGlobally(\n", - " CountExamples('After filtering \"Gender:MISSING\"')))\n", - "\n", - " # Because LFW data set has always faces by default, we are adding\n", - " # labels as 1.0 for all images.\n", - " def add_face_groundtruth(element):\n", - " example = tf.train.Example.FromString(element)\n", - " example.features.feature[label_key].float_list.value[:] = [1.0]\n", - " yield example.SerializeToString()\n", - "\n", - " final_data = (\n", - " filtered_data\n", - " | 'Add Face Groundtruth' \u003e\u003e beam.ParDo(add_face_groundtruth))\n", - "\n", - " # Run TFMA.\n", - " _ = (\n", - " final_data\n", - " | 'ExtractEvaluateAndWriteResults' \u003e\u003e\n", - " tfma.ExtractEvaluateAndWriteResults(\n", - " eval_shared_model=eval_shared_model,\n", - " 
compute_confidence_intervals=compute_confidence_intervals,\n", - " output_path=tfma_eval_result_path,\n", - " extractors=model_agnostic_extractors))\n", - "\n", - "eval_result = tfma.load_eval_result(output_path=tfma_eval_result_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktlASJQIzE3l" - }, - "source": [ - "# Render Fairness Indicators\n", - "\n", - "Render the Fairness Indicators widget with the exported evaluation results.\n", - "\n", - "Below you will see bar charts displaying performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the drop down menus at the top of the visualization.\n", - "\n", - "A relevant metric for this use case is true positive rate, also known as recall. Use the selector on the left hand side to choose the graph for true_positive_rate. These metric values match the values displayed on the [model card](https://modelcards.withgoogle.com/face-detection).\n", - "\n", - "For some photos, gender is labeled as young instead of male or female, if the person in the photo is too young to be accurately annotated." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JNaNhTCTAMHm" - }, - "outputs": [], - "source": [ - "widget_view.render_fairness_indicator(eval_result=eval_result,\n", - " slicing_column=slice_key)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [ - "Sxt-9qpNgPxo" - ], - "name": "Facessd Fairness Indicators Example Colab.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb b/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb deleted file mode 100644 index 00a40a74..00000000 --- a/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb +++ /dev/null @@ -1,712 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Tce3stUlHN0L" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tuOe1ymfHZPu" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aalPefrUUplk" - }, - "source": [ - "# Introduction to Fairness Indicators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MfBg1C5NB3X0" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_Example_Colab\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://tfhub.dev/google/random-nnlm-en-dim128/1\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YWcPbUNg1yez" - }, - "source": [ - "## Overview\n", - "\n", - 
"Fairness Indicators is a suite of tools built on top of [TensorFlow Model Analysis (TFMA)](https://www.tensorflow.org/tfx/model_analysis/get_started) that enable regular evaluation of fairness metrics in product pipelines. TFMA is a library for evaluating both TensorFlow and non-TensorFlow machine learning models. It allows you to evaluate your models on large amounts of data in a distributed manner, compute in-graph and other metrics over different slices of data, and visualize them in notebooks. \n", - "\n", - "Fairness Indicators is packaged with [TensorFlow Data Validation (TFDV)](https://www.tensorflow.org/tfx/data_validation/get_started) and the [What-If Tool](https://pair-code.github.io/what-if-tool/). Using Fairness Indicators allows you to: \n", - "\n", - "* Evaluate model performance, sliced across defined groups of users\n", - "* Gain confidence about results with confidence intervals and evaluations at multiple thresholds\n", - "* Evaluate the distribution of datasets\n", - "* Dive deep into individual slices to explore root causes and opportunities for improvement\n", - "\n", - "In this notebook, you will use Fairness Indicators to fix fairness issues in a model you train using the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification). Watch this [video](https://www.youtube.com/watch?v=pHT-ImFXPQo) for more details and context on the real-world scenario this is based on which is also one of primary motivations for creating Fairness Indicators." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjuCFktB2IJW" - }, - "source": [ - "## Dataset\n", - "\n", - "In this notebook, you will work with the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification), approximately 2 million public comments made public by the [Civil Comments platform](https://medium.com/@aja_15265/saying-goodbye-to-civil-comments-41859d3a2b1d) in 2017 for ongoing research. 
This effort was sponsored by [Jigsaw](https://jigsaw.google.com/), who have hosted competitions on Kaggle to help classify toxic comments as well as minimize unintended model bias.\n", - "\n", - "Each individual text comment in the dataset has a toxicity label, with the label being 1 if the comment is toxic and 0 if the comment is non-toxic. Within the data, a subset of comments are labeled with a variety of identity attributes, including categories for gender, sexual orientation, religion, and race or ethnicity." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u33JXdluZ2lG" - }, - "source": [ - "## Setup\n", - "\n", - "Install `fairness-indicators` and `witwidget`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EoRNffG599XP" - }, - "outputs": [], - "source": [ - "!pip install -q -U pip==20.2\n", - "\n", - "!pip install -q fairness-indicators\n", - "!pip install -q witwidget" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "alYUSbyv59j5" - }, - "source": [ - "You must restart the Colab runtime after installing. Select **Runtime \u003e Restart** runtime from the Colab menu.\n", - "\n", - "Do not proceed with the rest of this tutorial without first restarting the runtime." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RbRUqXDm6f1N" - }, - "source": [ - "Import all other required libraries." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B8dlyTyiTe-9" - }, - "outputs": [], - "source": [ - "import os\n", - "import tempfile\n", - "import apache_beam as beam\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime\n", - "import pprint\n", - "\n", - "from google.protobuf import text_format\n", - "\n", - "import tensorflow_hub as hub\n", - "import tensorflow as tf\n", - - "import tensorflow_model_analysis as tfma\n", - "import tensorflow_data_validation as tfdv\n", - "\n", - "from tfx_bsl.tfxio import tensor_adapter\n", - "from tfx_bsl.tfxio import tf_example_record\n", - "\n", - "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", - "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", - "\n", - "from fairness_indicators.tutorial_utils import util\n", - "\n", - "from witwidget.notebook.visualization import WitConfigBuilder\n", - "from witwidget.notebook.visualization import WitWidget\n", - "\n", - "from tensorflow_metadata.proto.v0 import schema_pb2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TsplOJGqWCf5" - }, - "source": [ - "## Download and analyze the data\n", - "\n", - "By default, this notebook downloads a preprocessed version of this dataset, but you may use the original dataset and re-run the processing steps if desired. In the original dataset, each comment is labeled with the percentage of raters who believed that a comment corresponds to a particular identity. For example, a comment might be labeled with the following: { male: 0.3, female: 1.0, transgender: 0.0, heterosexual: 0.8, homosexual_gay_or_lesbian: 1.0 } The processing step groups identity by category (gender, sexual_orientation, etc.) and removes identities with a score less than 0.5. So the example above would be converted to the following: of raters who believed that a comment corresponds to a particular identity. 
For example, the comment would be labeled with the following: { gender: [female], sexual_orientation: [heterosexual, homosexual_gay_or_lesbian] }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qmt4gkBFRBD2" - }, - "outputs": [], - "source": [ - "download_original_data = False #@param {type:\"boolean\"}\n", - "\n", - "if download_original_data:\n", - " train_tf_file = tf.keras.utils.get_file('train_tf.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/train_tf.tfrecord')\n", - " validate_tf_file = tf.keras.utils.get_file('validate_tf.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf.tfrecord')\n", - "\n", - " # The identity terms list will be grouped together by their categories\n", - " # (see 'IDENTITY_COLUMNS') on threshould 0.5. Only the identity term column,\n", - " # text column and label column will be kept after processing.\n", - " train_tf_file = util.convert_comments_data(train_tf_file)\n", - " validate_tf_file = util.convert_comments_data(validate_tf_file)\n", - "\n", - "else:\n", - " train_tf_file = tf.keras.utils.get_file('train_tf_processed.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", - " validate_tf_file = tf.keras.utils.get_file('validate_tf_processed.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vFOQ4AaIcAn2" - }, - "source": [ - "Use TFDV to analyze the data and find potential problems in it, such as missing values and data imbalances, that can lead to fairness disparities." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NdLBi6tN5i7I" - }, - "outputs": [], - "source": [ - "stats = tfdv.generate_statistics_from_tfrecord(data_location=train_tf_file)\n", - "tfdv.visualize_statistics(stats)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS9QiA96GXDE" - }, - "source": [ - "TFDV shows that there are some significant imbalances in the data which could lead to biased model outcomes. \n", - "\n", - "* The toxicity label (the value predicted by the model) is unbalanced. Only 8% of the examples in the training set are toxic, which means that a classifier could get 92% accuracy by predicting that all comments are non-toxic.\n", - "\n", - "* In the fields relating to identity terms, only 6.6k out of the 1.08 million (0.61%) training examples deal with homosexuality, and those related to bisexuality are even more rare. This indicates that performance on these slices may suffer due to lack of training data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ekzb7vVnPCc" - }, - "source": [ - "## Prepare the data\n", - "\n", - "Define a feature map to parse the data. Each example will have a label, comment text, and identity features `sexual orientation`, `gender`, `religion`, `race`, and `disability` that are associated with the text." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n4_nXQDykX6W" - }, - "outputs": [], - "source": [ - "BASE_DIR = tempfile.gettempdir()\n", - "\n", - "TEXT_FEATURE = 'comment_text'\n", - "LABEL = 'toxicity'\n", - "FEATURE_MAP = {\n", - " # Label:\n", - " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", - " # Text:\n", - " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", - "\n", - " # Identities:\n", - " 'sexual_orientation':tf.io.VarLenFeature(tf.string),\n", - " 'gender':tf.io.VarLenFeature(tf.string),\n", - " 'religion':tf.io.VarLenFeature(tf.string),\n", - " 'race':tf.io.VarLenFeature(tf.string),\n", - " 'disability':tf.io.VarLenFeature(tf.string),\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1B1ROCM__y8C" - }, - "source": [ - "Next, set up an input function to feed data into the model. Add a weight column to each example and upweight the toxic examples to account for the class imbalance identified by the TFDV. Use only identity features during the evaluation phase, as only the comments are fed into the model during training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YwoC-dzEDid3" - }, - "outputs": [], - "source": [ - "def train_input_fn():\n", - " def parse_function(serialized):\n", - " parsed_example = tf.io.parse_single_example(\n", - " serialized=serialized, features=FEATURE_MAP)\n", - " # Adds a weight column to deal with unbalanced classes.\n", - " parsed_example['weight'] = tf.add(parsed_example[LABEL], 0.1)\n", - " return (parsed_example,\n", - " parsed_example[LABEL])\n", - " train_dataset = tf.data.TFRecordDataset(\n", - " filenames=[train_tf_file]).map(parse_function).batch(512)\n", - " return train_dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mfbgerCsEOmN" - }, - "source": [ - "## Train the model\n", - "\n", - "Create and train a deep learning model on the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JaGvNrVijfws" - }, - "outputs": [], - "source": [ - "model_dir = os.path.join(BASE_DIR, 'train', datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\"))\n", - "\n", - "embedded_text_feature_column = hub.text_embedding_column(\n", - " key=TEXT_FEATURE,\n", - " module_spec='https://tfhub.dev/google/nnlm-en-dim128/1')\n", - "\n", - "classifier = tf.estimator.DNNClassifier(\n", - " hidden_units=[500, 100],\n", - " weight_column='weight',\n", - " feature_columns=[embedded_text_feature_column],\n", - " optimizer=tf.keras.optimizers.legacy.Adagrad(learning_rate=0.003),\n", - " loss_reduction=tf.losses.Reduction.SUM,\n", - " n_classes=2,\n", - " model_dir=model_dir)\n", - "\n", - "classifier.train(input_fn=train_input_fn, steps=1000)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTPqije9Eg5b" - }, - "source": [ - "## Analyze the model\n", - "\n", - "After obtaining the trained model, analyze it to compute fairness metrics using TFMA and Fairness Indicators. Begin by exporting the model as a [SavedModel](https://www.tensorflow.org/guide/saved_model). 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-vRc-Jyp8dRm" - }, - "source": [ - "### Export SavedModel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QLjiy5VCzlRw" - }, - "outputs": [], - "source": [ - "def eval_input_receiver_fn():\n", - " serialized_tf_example = tf.compat.v1.placeholder(\n", - " dtype=tf.string, shape=[None], name='input_example_placeholder')\n", - "\n", - " # This *must* be a dictionary containing a single key 'examples', which\n", - " # points to the input placeholder.\n", - " receiver_tensors = {'examples': serialized_tf_example}\n", - "\n", - " features = tf.io.parse_example(serialized_tf_example, FEATURE_MAP)\n", - " features['weight'] = tf.ones_like(features[LABEL])\n", - "\n", - " return tfma.export.EvalInputReceiver(\n", - " features=features,\n", - " receiver_tensors=receiver_tensors,\n", - " labels=features[LABEL])\n", - "\n", - "tfma_export_dir = tfma.export.export_eval_savedmodel(\n", - " estimator=classifier,\n", - " export_dir_base=os.path.join(BASE_DIR, 'tfma_eval_model'),\n", - " eval_input_receiver_fn=eval_input_receiver_fn)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3j8ODcee8rQ8" - }, - "source": [ - "### Compute Fairness Metrics\n", - "\n", - "Select the identity to compute metrics for and whether to run with confidence intervals using the dropdown in the panel on the right." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7shDmJbx9mqa" - }, - "outputs": [], - "source": [ - "#@title Fairness Indicators Computation Options\n", - "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", - "\n", - "#@markdown Modify the slice_selection for experiments on other identities.\n", - "slice_selection = 'sexual_orientation' #@param [\"sexual_orientation\", \"gender\", \"religion\", \"race\", \"disability\"]\n", - "print(f'Slice selection: {slice_selection}')\n", - "#@markdown Confidence Intervals can help you make better decisions regarding your data, but as it requires computing multiple resamples, is slower particularly in the colab environment that cannot take advantage of parallelization.\n", - "compute_confidence_intervals = False #@param {type:\"boolean\"}\n", - "print(f'Compute confidence intervals: {compute_confidence_intervals}')\n", - "\n", - "# Define slices that you want the evaluation to run on.\n", - "eval_config_pbtxt = \"\"\"\n", - " model_specs {\n", - " label_key: \"%s\"\n", - " }\n", - " metrics_specs {\n", - " metrics {\n", - " class_name: \"FairnessIndicators\"\n", - " config: '{ \"thresholds\": [0.1, 0.3, 0.5, 0.7, 0.9] }'\n", - " }\n", - " }\n", - " slicing_specs {} # overall slice\n", - " slicing_specs {\n", - " feature_keys: [\"%s\"]\n", - " }\n", - " options {\n", - " compute_confidence_intervals { value: %s }\n", - " disabled_outputs { values: \"analysis\" }\n", - " }\n", - " \"\"\" % (LABEL, slice_selection, compute_confidence_intervals)\n", - "eval_config = text_format.Parse(eval_config_pbtxt, tfma.EvalConfig())\n", - "eval_shared_model = tfma.default_eval_shared_model(\n", - " eval_saved_model_path=tfma_export_dir)\n", - "\n", - "schema = text_format.Parse(\n", - " \"\"\"\n", - " tensor_representation_group {\n", - " key: \"\"\n", - " value {\n", - " tensor_representation {\n", - " key: \"comment_text\"\n", - " value {\n", - " dense_tensor {\n", - " column_name: 
\"comment_text\"\n", - " shape {}\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " name: \"comment_text\"\n", - " type: BYTES\n", - " }\n", - " feature {\n", - " name: \"toxicity\"\n", - " type: FLOAT\n", - " }\n", - " feature {\n", - " name: \"sexual_orientation\"\n", - " type: BYTES\n", - " }\n", - " feature {\n", - " name: \"gender\"\n", - " type: BYTES\n", - " }\n", - " feature {\n", - " name: \"religion\"\n", - " type: BYTES\n", - " }\n", - " feature {\n", - " name: \"race\"\n", - " type: BYTES\n", - " }\n", - " feature {\n", - " name: \"disability\"\n", - " type: BYTES\n", - " }\n", - " \"\"\", schema_pb2.Schema())\n", - "tfxio = tf_example_record.TFExampleRecord(\n", - " file_pattern=validate_tf_file,\n", - " schema=schema,\n", - " raw_record_column_name=tfma.ARROW_INPUT_COLUMN)\n", - "tensor_adapter_config = tensor_adapter.TensorAdapterConfig(\n", - " arrow_schema=tfxio.ArrowSchema(),\n", - " tensor_representations=tfxio.TensorRepresentations())\n", - "\n", - "with beam.Pipeline() as pipeline:\n", - " (pipeline\n", - " | 'ReadFromTFRecordToArrow' \u003e\u003e tfxio.BeamSource()\n", - " | 'ExtractEvaluateAndWriteResults' \u003e\u003e tfma.ExtractEvaluateAndWriteResults(\n", - " eval_config=eval_config,\n", - " eval_shared_model=eval_shared_model,\n", - " output_path=tfma_eval_result_path,\n", - " tensor_adapter_config=tensor_adapter_config))\n", - "\n", - "eval_result = tfma.load_eval_result(output_path=tfma_eval_result_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jtDpTBPeRw2d" - }, - "source": [ - "### Visualize data using the What-if Tool\n", - "\n", - "In this section, you'll use the What-If Tool's interactive visual interface to explore and manipulate data at a micro-level.\n", - "\n", - "Each point on the scatter plot on the right-hand panel represents one of the examples in the subset loaded into the tool. 
Click on one of the points to see details about this particular example in the left-hand panel. The comment text, ground truth toxicity, and applicable identities are shown. At the bottom of this left-hand panel, you see the inference results from the model you just trained.\n", - "\n", - "Modify the text of the example and then click the **Run inference** button to view how your changes caused the perceived toxicity prediction to change." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wtjZo4BDlV1m" - }, - "outputs": [], - "source": [ - "DEFAULT_MAX_EXAMPLES = 1000\n", - "\n", - "# Load 100000 examples in memory. When first rendered, \n", - "# What-If Tool should only display 1000 of these due to browser constraints.\n", - "def wit_dataset(file, num_examples=100000):\n", - " dataset = tf.data.TFRecordDataset(\n", - " filenames=[file]).take(num_examples)\n", - " return [tf.train.Example.FromString(d.numpy()) for d in dataset]\n", - "\n", - "wit_data = wit_dataset(train_tf_file)\n", - "config_builder = WitConfigBuilder(wit_data[:DEFAULT_MAX_EXAMPLES]).set_estimator_and_feature_spec(\n", - " classifier, FEATURE_MAP).set_label_vocab(['non-toxicity', LABEL]).set_target_feature(LABEL)\n", - "wit = WitWidget(config_builder)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktlASJQIzE3l" - }, - "source": [ - "## Render Fairness Indicators\n", - "\n", - "Render the Fairness Indicators widget with the exported evaluation results.\n", - "\n", - "Below you will see bar charts displaying performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the dropdown menus at the top of the visualization. \n", - "\n", - "The Fairness Indicator widget is integrated with the What-If Tool rendered above. If you select one slice of the data in the bar chart, the What-If Tool will update to show you examples from the selected slice. 
When the data reloads in the What-If Tool above, try modifying **Color By** to **toxicity**. This can give you a visual understanding of the toxicity balance of examples by slice." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JNaNhTCTAMHm" - }, - "outputs": [], - "source": [ - "event_handlers={'slice-selected':\n", - " wit.create_selection_callback(wit_data, DEFAULT_MAX_EXAMPLES)}\n", - "widget_view.render_fairness_indicator(eval_result=eval_result,\n", - " slicing_column=slice_selection,\n", - " event_handlers=event_handlers\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nRuZsLr6V_fY" - }, - "source": [ - "With this particular dataset and task, systematically higher false positive and false negative rates for certain identities can lead to negative consequences. For example, in a content moderation system, a higher-than-overall false positive rate for a certain group can lead to those voices being silenced. Thus, it is important to regularly evaluate these types of criteria as you develop and improve models, and utilize tools such as Fairness Indicators, TFDV, and WIT to help illuminate potential problems. Once you've identified fairness issues, you can experiment with new data sources, data balancing, or other techniques to improve performance on underperforming groups.\n", - "\n", - "See [here](https://tensorflow.org/responsible_ai/fairness_indicators/guide/guidance) for more information and guidance on how to use Fairness Indicators.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wCMEMtGfx0Ti" - }, - "source": [ - "## Use fairness evaluation results\n", - "\n", - "The [`eval_result`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult) object, rendered above in `render_fairness_indicator()`, has its own API that you can leverage to read TFMA results into your programs." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z6stkMLwyfza" - }, - "source": [ - "### Get evaluated slices and metrics\n", - "\n", - "Use [`get_slice_names()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_slice_names) and [`get_metric_names()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metric_names) to get the evaluated slices and metrics, respectively." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eXrt7SdZyzWD" - }, - "outputs": [], - "source": [ - "pp = pprint.PrettyPrinter()\n", - "\n", - "print(\"Slices:\")\n", - "pp.pprint(eval_result.get_slice_names())\n", - "print(\"\\nMetrics:\")\n", - "pp.pprint(eval_result.get_metric_names())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ctAvudY2zUu4" - }, - "source": [ - "Use [`get_metrics_for_slice()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metrics_for_slice) to get the metrics for a particular slice as a dictionary mapping metric names to [metric values](https://github.com/tensorflow/model-analysis/blob/cdb6790dcd7a37c82afb493859b3ef4898963fee/tensorflow_model_analysis/proto/metrics_for_slice.proto#L194)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zjCxZGHmzF0R" - }, - "outputs": [], - "source": [ - "baseline_slice = ()\n", - "heterosexual_slice = (('sexual_orientation', 'heterosexual'),)\n", - "\n", - "print(\"Baseline metric values:\")\n", - "pp.pprint(eval_result.get_metrics_for_slice(baseline_slice))\n", - "print(\"\\nHeterosexual metric values:\")\n", - "pp.pprint(eval_result.get_metrics_for_slice(heterosexual_slice))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UDo3LhoR0Rq1" - }, - "source": [ - "Use [`get_metrics_for_all_slices()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metrics_for_all_slices) to get the metrics for all slices as a dictionary mapping each slice to the corresponding metrics dictionary you obtain from running `get_metrics_for_slice()` on it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "96N2l2xI0fZd" - }, - "outputs": [], - "source": [ - "pp.pprint(eval_result.get_metrics_for_all_slices())" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Fairness Indicators Example Colab.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb b/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb deleted file mode 100644 index d80f3e15..00000000 --- a/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb +++ /dev/null @@ -1,467 +0,0 @@ -{ - "cells": [ - { - "metadata": { - "id": "Bfrh3DUze0QN" - }, - "cell_type": "markdown", - "source": [ - "##### Copyright 2020 The TensorFlow Authors." 
- ] - }, - { - "metadata": { - "id": "sx-jnufYfcJG" - }, - "cell_type": "code", - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "s1bQihY6-Y4N" - }, - "cell_type": "markdown", - "source": [ - "# Pandas DataFrame to Fairness Indicators Case Study\n" - ] - }, - { - "metadata": { - "id": "XHTjeiUMeolM" - }, - "cell_type": "markdown", - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_Pandas_Case_Study\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb\"\u003e\u003cimg 
src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "metadata": { - "id": "ay80altXzvgZ" - }, - "cell_type": "markdown", - "source": [ - "## Case Study Overview\n", - "In this case study we will apply [TensorFlow Model Analysis](https://www.tensorflow.org/tfx/model_analysis/get_started) and [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators) to evaluate data stored as a Pandas DataFrame, where each row contains ground truth labels, various features, and a model prediction. We will show how this workflow can be used to spot potential fairness concerns, independent of the framework one used to construct and train the model. As in this case study, we can analyze the results from any machine learning framework (e.g. TensorFlow, JAX, etc) once they are converted to a Pandas DataFrame.\n", - " \n", - "For this exercise, we will leverage the Deep Neural Network (DNN) model that was developed in the [Shape Constraints for Ethics with Tensorflow Lattice](https://colab.research.google.com/github/tensorflow/lattice/blob/master/docs/tutorials/shape_constraints_for_ethics.ipynb#scrollTo=uc0VwsT5nvQi) case study using the Law School Admissions dataset from the Law School Admissions Council (LSAC). This classifier attempts to predict whether or not a student will pass the bar, based on their Law School Admission Test (LSAT) score and undergraduate GPA.\n", - "\n", - "## LSAC Dataset\n", - "The dataset used within this case study was originally collected for a study called '[LSAC National Longitudinal Bar Passage Study. 
LSAC Research Report Series](https://eric.ed.gov/?id=ED469370)' by Linda Wightman in 1998. The dataset is currently hosted [here](http://www.seaphe.org/databases.php).\n", - "\n", - "* **dnn_bar_pass_prediction**: The LSAT prediction from the DNN model.\n", - "* **gender**: Gender of the student.\n", - "* **lsat**: LSAT score received by the student.\n", - "* **pass_bar**: Ground truth label indicating whether or not the student eventually passed the bar.\n", - "* **race**: Race of the student.\n", - "* **ugpa**: A student's undergraduate GPA.\n" - ] - }, - { - "metadata": { - "id": "Ob01ASKqixfw" - }, - "cell_type": "code", - "source": [ - "!pip install -q -U pip==20.2\n", - "\n", - "!pip install -q -U \\\n", - " tensorflow-model-analysis==0.48.0 \\\n", - " tensorflow-data-validation==1.17.0 \\\n", - " tfx-bsl==1.17.1" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "tnxSvgkaSEIj" - }, - "cell_type": "markdown", - "source": [ - "## Importing required packages:" - ] - }, - { - "metadata": { - "id": "0q8cTfpTkEMP" - }, - "cell_type": "code", - "source": [ - "import os\n", - "import tempfile\n", - "import pandas as pd\n", - "import six.moves.urllib as urllib\n", - "import pprint\n", - "\n", - "import tensorflow_model_analysis as tfma\n", - "from google.protobuf import text_format\n", - "\n", - "import tensorflow as tf\n", - "tf.compat.v1.enable_v2_behavior()" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "b8kWW3t4-eS1" - }, - "cell_type": "markdown", - "source": [ - "## Download the data and explore the initial dataset." 
- ] - }, - { - "metadata": { - "id": "wMZJtgj0qJ0x" - }, - "cell_type": "code", - "source": [ - "# Download the LSAT dataset and setup the required filepaths.\n", - "_DATA_ROOT = tempfile.mkdtemp(prefix='lsat-data')\n", - "_DATA_PATH = 'https://storage.googleapis.com/lawschool_dataset/bar_pass_prediction.csv'\n", - "_DATA_FILEPATH = os.path.join(_DATA_ROOT, 'bar_pass_prediction.csv')\n", - "\n", - "data = urllib.request.urlopen(_DATA_PATH)\n", - "\n", - "_LSAT_DF = pd.read_csv(data)\n", - "\n", - "# To simpliy the case study, we will only use the columns that will be used for\n", - "# our model.\n", - "_COLUMN_NAMES = [\n", - " 'dnn_bar_pass_prediction',\n", - " 'gender',\n", - " 'lsat',\n", - " 'pass_bar',\n", - " 'race1',\n", - " 'ugpa',\n", - "]\n", - "\n", - "_LSAT_DF.dropna()\n", - "_LSAT_DF['gender'] = _LSAT_DF['gender'].astype(str)\n", - "_LSAT_DF['race1'] = _LSAT_DF['race1'].astype(str)\n", - "_LSAT_DF = _LSAT_DF[_COLUMN_NAMES]\n", - "\n", - "_LSAT_DF.head()" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "GyeVg2s7-wlB" - }, - "cell_type": "markdown", - "source": [ - "## Configure Fairness Indicators.\n", - "There are several parameters that you’ll need to take into account when using Fairness Indicators with a DataFrame \n", - "\n", - "* Your input DataFrame must contain a prediction column and label column from your model. By default Fairness Indicators will look for a prediction column called `prediction` and a label column called `label` within your DataFrame.\n", - " * If either of these values are not found a KeyError will be raised.\n", - "\n", - "* In addition to a DataFrame, you’ll also need to include an `eval_config` that should include the metrics to compute, slices to compute the metrics on, and the column names for example labels and predictions. \n", - " * `metrics_specs` will set the metrics to compute. 
The `FairnessIndicators` metric will be required to render the fairness metrics and you can see a list of additional optional metrics [here](https://www.tensorflow.org/tfx/model_analysis/metrics).\n", - "\n", - " * `slicing_specs` is an optional slicing parameter to specify what feature you’re interested in investigating. Within this case study race1 is used, however you can also set this value to another feature (for example gender in the context of this DataFrame). If `slicing_specs` is not provided all features will be included.\n", - " * If your DataFrame includes a label or prediction column that is different from the default `prediction` or `label`, you can configure the `label_key` and `prediction_key` to a new value.\n", - "\n", - "* If `output_path` is not specified a temporary directory will be created." - ] - }, - { - "metadata": { - "id": "53caFasB5V9p" - }, - "cell_type": "code", - "source": [ - "# Specify Fairness Indicators in eval_config.\n", - "eval_config = text_format.Parse(\"\"\"\n", - " model_specs {\n", - " prediction_key: 'dnn_bar_pass_prediction',\n", - " label_key: 'pass_bar'\n", - " }\n", - " metrics_specs {\n", - " metrics {class_name: \"AUC\"}\n", - " metrics {\n", - " class_name: \"FairnessIndicators\"\n", - " config: '{\"thresholds\": [0.50, 0.90]}'\n", - " }\n", - " }\n", - " slicing_specs {\n", - " feature_keys: 'race1'\n", - " }\n", - " slicing_specs {}\n", - " \"\"\", tfma.EvalConfig())\n", - "\n", - "# Run TensorFlow Model Analysis.\n", - "eval_result = tfma.analyze_raw_data(\n", - " data=_LSAT_DF,\n", - " eval_config=eval_config,\n", - " output_path=_DATA_ROOT)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "KD96mw0e--DE" - }, - "cell_type": "markdown", - "source": [ - "## Explore model performance with Fairness Indicators.\n", - "\n", - "After running Fairness Indicators, we can visualize different metrics that we selected to analyze our models performance. 
Within this case study we’ve included Fairness Indicators and arbitrarily picked AUC.\n", - "\n", - "When we first look at the overall AUC for each race slice we can see a slight discrepancy in model performance, but nothing that is arguably alarming.\n", - "\n", - "* **Asian**: 0.58\n", - "* **Black**: 0.58\n", - "* **Hispanic**: 0.58\n", - "* **Other**: 0.64\n", - "* **White**: 0.6\n", - "\n", - "However, when we look at the false negative rates split by race, our model again incorrectly predicts the likelihood of a user passing the bar at different rates and, this time, does so by a lot. \n", - "\n", - "* **Asian**: 0.01\n", - "* **Black**: 0.05\n", - "* **Hispanic**: 0.02\n", - "* **Other**: 0.01\n", - "* **White**: 0.01\n", - "\n", - "Most notably the difference between Black and White students is about 380%, meaning that our model is nearly 4x more likely to incorrectly predict that a black student will not pass the bar, than a whilte student. If we were to continue with this effort, a practitioner could use these results as a signal that they should spend more time ensuring that their model works well for people from all backgrounds." - ] - }, - { - "metadata": { - "id": "NIdchYPb-_ZV" - }, - "cell_type": "code", - "source": [ - "# Render Fairness Indicators.\n", - "tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "NprhBTCbY1sF" - }, - "cell_type": "markdown", - "source": [ - "# tfma.EvalResult" - ] - }, - { - "metadata": { - "id": "6f92-e98Y40r" - }, - "cell_type": "markdown", - "source": [ - "The [`eval_result`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult) object, rendered above in `render_fairness_indicator()`, has its own API that can be used to read TFMA results into your programs." 
- ] - }, - { - "metadata": { - "id": "CDDUxdx-Y8e0" - }, - "cell_type": "markdown", - "source": [ - "## [`get_slice_names()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_slice_names) and [`get_metric_names()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metric_names)" - ] - }, - { - "metadata": { - "id": "oG_mNUNbY98t" - }, - "cell_type": "markdown", - "source": [ - "To get the evaluated slices and metrics, you can use the respective functions." - ] - }, - { - "metadata": { - "id": "kbA1sXhCY_G7" - }, - "cell_type": "code", - "source": [ - "pp = pprint.PrettyPrinter()\n", - "\n", - "print(\"Slices:\")\n", - "pp.pprint(eval_result.get_slice_names())\n", - "print(\"\\nMetrics:\")\n", - "pp.pprint(eval_result.get_metric_names())" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "rA1M8aBmZAk6" - }, - "cell_type": "markdown", - "source": [ - "## [`get_metrics_for_slice()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metrics_for_slice) and [`get_metrics_for_all_slices()`](https://www.tensorflow.org/tfx/model_analysis/api_docs/python/tfma/EvalResult#get_metrics_for_all_slices)" - ] - }, - { - "metadata": { - "id": "a3Ath5MsZCRX" - }, - "cell_type": "markdown", - "source": [ - "If you want to get the metrics for a particular slice, you can use `get_metrics_for_slice()`. It returns a dictionary mapping metric names to [metric values](https://github.com/tensorflow/model-analysis/blob/cdb6790dcd7a37c82afb493859b3ef4898963fee/tensorflow_model_analysis/proto/metrics_for_slice.proto#L194)." 
- ] - }, - { - "metadata": { - "id": "9BWg5HoyZDh-" - }, - "cell_type": "code", - "source": [ - "baseline_slice = ()\n", - "black_slice = (('race1', 'black'),)\n", - "\n", - "print(\"Baseline metric values:\")\n", - "pp.pprint(eval_result.get_metrics_for_slice(baseline_slice))\n", - "print(\"Black metric values:\")\n", - "pp.pprint(eval_result.get_metrics_for_slice(black_slice))" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "bDcOxvqBZEfg" - }, - "cell_type": "markdown", - "source": [ - "If you want to get the metrics for all slices, `get_metrics_for_all_slices()` returns a dictionary mapping each slice to the corresponding `get_metrics_for_slices(slice)`." - ] - }, - { - "metadata": { - "id": "p4NQCi52ZFrw" - }, - "cell_type": "code", - "source": [ - "pp.pprint(eval_result.get_metrics_for_all_slices())" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "y-nbqnSTkmW3" - }, - "cell_type": "markdown", - "source": [ - "## Conclusion\n", - "Within this case study we imported a dataset into a Pandas DataFrame that we then analyzed with Fairness Indicators. Understanding the results of your model and underlying data is an important step in ensuring your model doesn't reflect harmful bias. In the context of this case study we examined the the LSAC dataset and how predictions from this data could be impacted by a students race. The concept of “what is unfair and what is fair have been introduced in multiple disciplines for well over 50 years, including in education, hiring, and machine learning.”\u003csup\u003e1\u003c/sup\u003e Fairness Indicator is a tool to help mitigate fairness concerns in your machine learning model.\n", - "\n", - "For more information on using Fairness Indicators and resources to learn more about fairness concerns see [here](https://www.tensorflow.org/responsible_ai/fairness_indicators/guide).\n", - "\n", - "---\n", - "\n", - "1. Hutchinson, B., Mitchell, M. (2018). 
50 Years of Test (Un)fairness: Lessons for Machine Learning. https://arxiv.org/abs/1811.10104\n" - ] - }, - { - "metadata": { - "id": "REV1rBnoBAo1" - }, - "cell_type": "markdown", - "source": [ - "## Appendix\n", - "\n", - "Below are a few functions to help convert ML models to Pandas DataFrame.\n" - ] - }, - { - "metadata": { - "id": "F4qv9GXiBsFA" - }, - "cell_type": "code", - "source": [ - "# TensorFlow Estimator to Pandas DataFrame:\n", - "\n", - "# _X_VALUE = # X value of binary estimator.\n", - "# _Y_VALUE = # Y value of binary estimator.\n", - "# _GROUND_TRUTH_LABEL = # Ground truth value of binary estimator.\n", - "\n", - "def _get_predicted_probabilities(estimator, input_df, get_input_fn):\n", - " predictions = estimator.predict(\n", - " input_fn=get_input_fn(input_df=input_df, num_epochs=1))\n", - " return [prediction['probabilities'][1] for prediction in predictions]\n", - "\n", - "def _get_input_fn_law(input_df, num_epochs, batch_size=None):\n", - " return tf.compat.v1.estimator.inputs.pandas_input_fn(\n", - " x=input_df[[_X_VALUE, _Y_VALUE]],\n", - " y=input_df[_GROUND_TRUTH_LABEL],\n", - " num_epochs=num_epochs,\n", - " batch_size=batch_size or len(input_df),\n", - " shuffle=False)\n", - "\n", - "def estimator_to_dataframe(estimator, input_df, num_keypoints=20):\n", - " x = np.linspace(min(input_df[_X_VALUE]), max(input_df[_X_VALUE]), num_keypoints)\n", - " y = np.linspace(min(input_df[_Y_VALUE]), max(input_df[_Y_VALUE]), num_keypoints)\n", - "\n", - " x_grid, y_grid = np.meshgrid(x, y)\n", - "\n", - " positions = np.vstack([x_grid.ravel(), y_grid.ravel()])\n", - " plot_df = pd.DataFrame(positions.T, columns=[_X_VALUE, _Y_VALUE])\n", - " plot_df[_GROUND_TRUTH_LABEL] = np.ones(len(plot_df))\n", - " predictions = _get_predicted_probabilities(\n", - " estimator=estimator, input_df=plot_df, get_input_fn=_get_input_fn_law)\n", - " return pd.DataFrame(\n", - " data=np.array(np.reshape(predictions, x_grid.shape)).flatten())" - ], - "outputs": [], - 
"execution_count": null - } - ], - "metadata": { - "colab": { - "collapsed_sections": [ - "Bfrh3DUze0QN" - ], - "name": "Pandas DataFrame to Fairness Indicators Case Study", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb b/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb deleted file mode 100644 index 0beb5612..00000000 --- a/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb +++ /dev/null @@ -1,981 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "JmvzTcYice-_" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zlvAS8a9cD_t" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b2VYQpTttmVN" - }, - "source": [ - "# TensorFlow Constrained Optimization Example Using CelebA Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3iFsS2WSeRwe" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-DQoReGDeN16" - }, - "source": [ - "This notebook demonstrates an easy way to create and optimize constrained problems using the TFCO library. 
This method can be useful in improving models when we find that they’re not performing equally well across different slices of our data, which we can identify using [Fairness Indicators](https://www.tensorflow.org/responsible_ai/fairness_indicators/guide). The second of Google’s AI principles states that our technology should avoid creating or reinforcing unfair bias, and we believe this technique can help improve model fairness in some situations. In particular, this notebook will:\n", - "\n", - "\n", - "* Train a simple, *unconstrained* neural network model to detect a person's smile in images using [`tf.keras`](https://www.tensorflow.org/guide/keras) and the large-scale CelebFaces Attributes ([CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)) dataset.\n", - "* Evaluate model performance against a commonly used fairness metric across age groups, using Fairness Indicators.\n", - "* Set up a simple constrained optimization problem to achieve fairer performance across age groups.\n", - "* Retrain the now *constrained* model and evaluate performance again, ensuring that our chosen fairness metric has improved.\n", - "\n", - "Last updated: 3/11 Feb 2020" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JyCbEWt5Zxe2" - }, - "source": [ - "# Installation\n", - "This notebook was created in [Colaboratory](https://research.google.com/colaboratory/faq.html), connected to the Python 3 Google Compute Engine backend. If you wish to host this notebook in a different environment, then you should not experience any major issues provided you include all the required packages in the cells below.\n", - "\n", - "Note that the very first time you run the pip installs, you may be asked to restart the runtime because of preinstalled out of date packages. Once you do so, the correct packages will be used." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "T-Zm-KDdt0bn" - }, - "outputs": [], - "source": [ - "#@title Pip installs\n", - "!pip install -q -U pip==20.2\n", - "\n", - "!pip install git+https://github.com/google-research/tensorflow_constrained_optimization\n", - "!pip install -q tensorflow-datasets tensorflow\n", - "!pip install fairness-indicators \\\n", - " \"absl-py==0.12.0\" \\\n", - " \"apache-beam\u003c3,\u003e=2.47\" \\\n", - " \"avro-python3==1.9.1\" \\\n", - " \"pyzmq==17.0.0\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UXWXhBLvISOY" - }, - "source": [ - "Note that depending on when you run the cell below, you may receive a warning about the default version of TensorFlow in Colab switching to TensorFlow 2.X soon. You can safely ignore that warning as this notebook was designed to be compatible with TensorFlow 1.X and 2.X." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UTBBdSGaZ8aW" - }, - "outputs": [], - "source": [ - "#@title Import Modules\n", - "import os\n", - "import sys\n", - "import tempfile\n", - "import urllib\n", - "\n", - "import tensorflow as tf\n", - - "from tensorflow import keras\n", - "\n", - "import tensorflow_datasets as tfds\n", - "tfds.disable_progress_bar()\n", - "\n", - "import numpy as np\n", - "\n", - "import tensorflow_constrained_optimization as tfco\n", - "\n", - "from tensorflow_metadata.proto.v0 import schema_pb2\n", - "from tfx_bsl.tfxio import tensor_adapter\n", - "from tfx_bsl.tfxio import tf_example_record" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "70tLum8uIZUm" - }, - "source": [ - "Additionally, we add a few imports that are specific to Fairness Indicators which we will use to evaluate and visualize the model's performance." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "7Se0Z0Bo9K-5" - }, - "outputs": [], - "source": [ - "#@title Fairness Indicators related imports\n", - "import tensorflow_model_analysis as tfma\n", - "import fairness_indicators as fi\n", - "from google.protobuf import text_format\n", - "import apache_beam as beam" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xSG2HP7goGrj" - }, - "source": [ - "Although TFCO is compatible with eager and graph execution, this notebook assumes that eager execution is enabled by default as it is in TensorFlow 2.x. To ensure that nothing breaks, eager execution will be enabled in the cell below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "W0ZusW1-lBao" - }, - "outputs": [], - "source": [ - "#@title Enable Eager Execution and Print Versions\n", - "if tf.__version__ \u003c \"2.0.0\":\n", - " tf.compat.v1.enable_eager_execution()\n", - " print(\"Eager execution enabled.\")\n", - "else:\n", - " print(\"Eager execution enabled by default.\")\n", - "\n", - "print(\"TensorFlow \" + tf.__version__)\n", - "print(\"TFMA \" + tfma.VERSION_STRING)\n", - "print(\"TFDS \" + tfds.version.__version__)\n", - "print(\"FI \" + fi.version.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "idY3Uuk3yvty" - }, - "source": [ - "# CelebA Dataset\n", - "[CelebA](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale face attributes dataset with more than 200,000 celebrity images, each with 40 attribute annotations (such as hair type, fashion accessories, facial features, etc.) and 5 landmark locations (eyes, mouth and nose positions). 
For more details take a look at [the paper](https://liuziwei7.github.io/projects/FaceAttributes.html).\n", - "With the permission of the owners, we have stored this dataset on Google Cloud Storage and mostly access it via [TensorFlow Datasets(`tfds`)](https://www.tensorflow.org/datasets).\n", - "\n", - "In this notebook:\n", - "* Our model will attempt to classify whether the subject of the image is smiling, as represented by the \"Smiling\" attribute\u003csup\u003e*\u003c/sup\u003e.\n", - "* Images will be resized from 218x178 to 28x28 to reduce the execution time and memory when training.\n", - "* Our model's performance will be evaluated across age groups, using the binary \"Young\" attribute. We will call this \"age group\" in this notebook.\n", - "\n", - "___\n", - "\n", - "\u003csup\u003e*\u003c/sup\u003e While there is little information available about the labeling methodology for this dataset, we will assume that the \"Smiling\" attribute was determined by a pleased, kind, or amused expression on the subject's face. 
For the purpose of this case study, we will take these labels as ground truth.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zCSemFST0b89" - }, - "outputs": [], - "source": [ - "gcs_base_dir = \"gs://celeb_a_dataset/\"\n", - "celeb_a_builder = tfds.builder(\"celeb_a\", data_dir=gcs_base_dir, version='2.0.0')\n", - "\n", - "celeb_a_builder.download_and_prepare()\n", - "\n", - "num_test_shards_dict = {'0.3.0': 4, '2.0.0': 2} # Used because we download the test dataset separately\n", - "version = str(celeb_a_builder.info.version)\n", - "print('Celeb_A dataset version: %s' % version)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Ocqv3R06APfW" - }, - "outputs": [], - "source": [ - "#@title Test dataset helper functions\n", - "local_root = tempfile.mkdtemp(prefix='test-data')\n", - "def local_test_filename_base():\n", - " return local_root\n", - "\n", - "def local_test_file_full_prefix():\n", - " return os.path.join(local_test_filename_base(), \"celeb_a-test.tfrecord\")\n", - "\n", - "def copy_test_files_to_local():\n", - " filename_base = local_test_file_full_prefix()\n", - " num_test_shards = num_test_shards_dict[version]\n", - " for shard in range(num_test_shards):\n", - " url = \"https://storage.googleapis.com/celeb_a_dataset/celeb_a/%s/celeb_a-test.tfrecord-0000%s-of-0000%s\" % (version, shard, num_test_shards)\n", - " filename = \"%s-0000%s-of-0000%s\" % (filename_base, shard, num_test_shards)\n", - " res = urllib.request.urlretrieve(url, filename)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u5PDLXZb_uIj" - }, - "source": [ - "## Caveats\n", - "Before moving forward, there are several considerations to keep in mind in using CelebA:\n", - "* Although in principle this notebook could use any dataset of face images, CelebA was chosen because it contains public domain images of public figures.\n", - "* All of the attribute 
annotations in CelebA are operationalized as binary categories. For example, the \"Young\" attribute (as determined by the dataset labelers) is denoted as either present or absent in the image.\n", - "* CelebA's categorizations do not reflect real human diversity of attributes.\n", - "* For the purposes of this notebook, the feature containing the \"Young\" attribute is referred to as \"age group\", where the presence of the \"Young\" attribute in an image is labeled as a member of the \"Young\" age group and the absence of the \"Young\" attribute is labeled as a member of the \"Not Young\" age group. These are assumptions made as this information is not mentioned in the [original paper](http://openaccess.thecvf.com/content_iccv_2015/html/Liu_Deep_Learning_Face_ICCV_2015_paper.html).\n", - "* As such, performance in the models trained in this notebook is tied to the ways the attributes have been operationalized and annotated by the authors of CelebA.\n", - "* This model should not be used for commercial purposes as that would violate [CelebA's non-commercial research agreement](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Elkiu92cY2bY" - }, - "source": [ - "# Setting Up Input Functions\n", - "The subsequent cells will help streamline the input pipeline as well as visualize performance.\n", - "\n", - "First we define some data-related variables and define a requisite preprocessing function." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gDdarTZxk6y4" - }, - "outputs": [], - "source": [ - "#@title Define Variables\n", - "ATTR_KEY = \"attributes\"\n", - "IMAGE_KEY = \"image\"\n", - "LABEL_KEY = \"Smiling\"\n", - "GROUP_KEY = \"Young\"\n", - "IMAGE_SIZE = 28" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "SD-H70Je0cTp" - }, - "outputs": [], - "source": [ - "#@title Define Preprocessing Functions\n", - "def preprocess_input_dict(feat_dict):\n", - " # Separate out the image and target variable from the feature dictionary.\n", - " image = feat_dict[IMAGE_KEY]\n", - " label = feat_dict[ATTR_KEY][LABEL_KEY]\n", - " group = feat_dict[ATTR_KEY][GROUP_KEY]\n", - "\n", - " # Resize and normalize image.\n", - " image = tf.cast(image, tf.float32)\n", - " image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])\n", - " image /= 255.0\n", - "\n", - " # Cast label and group to float32.\n", - " label = tf.cast(label, tf.float32)\n", - " group = tf.cast(group, tf.float32)\n", - "\n", - " feat_dict[IMAGE_KEY] = image\n", - " feat_dict[ATTR_KEY][LABEL_KEY] = label\n", - " feat_dict[ATTR_KEY][GROUP_KEY] = group\n", - "\n", - " return feat_dict\n", - "\n", - "get_image_and_label = lambda feat_dict: (feat_dict[IMAGE_KEY], feat_dict[ATTR_KEY][LABEL_KEY])\n", - "get_image_label_and_group = lambda feat_dict: (feat_dict[IMAGE_KEY], feat_dict[ATTR_KEY][LABEL_KEY], feat_dict[ATTR_KEY][GROUP_KEY])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iwg3sPmExciD" - }, - "source": [ - "Then, we build out the data functions we need in the rest of the colab." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KbR64r0VVG5h" - }, - "outputs": [], - "source": [ - "# Train data returning either 2 or 3 elements (the third element being the group)\n", - "def celeb_a_train_data_wo_group(batch_size):\n", - " celeb_a_train_data = celeb_a_builder.as_dataset(split='train').shuffle(1024).repeat().batch(batch_size).map(preprocess_input_dict)\n", - " return celeb_a_train_data.map(get_image_and_label)\n", - "def celeb_a_train_data_w_group(batch_size):\n", - " celeb_a_train_data = celeb_a_builder.as_dataset(split='train').shuffle(1024).repeat().batch(batch_size).map(preprocess_input_dict)\n", - " return celeb_a_train_data.map(get_image_label_and_group)\n", - "\n", - "# Test data for the overall evaluation\n", - "celeb_a_test_data = celeb_a_builder.as_dataset(split='test').batch(1).map(preprocess_input_dict).map(get_image_label_and_group)\n", - "# Copy test data locally to be able to read it into tfma\n", - "copy_test_files_to_local()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NXO3woTxiCk0" - }, - "source": [ - "# Build a simple DNN Model\n", - "Because this notebook focuses on TFCO, we will assemble a simple, unconstrained `tf.keras.Sequential` model.\n", - "\n", - "We may be able to greatly improve model performance by adding some complexity (e.g., more densely-connected layers, exploring different activation functions, increasing image size), but that may distract from the goal of demonstrating how easy it is to apply the TFCO library when working with Keras. For that reason, the model will be kept simple — but feel encouraged to explore this space." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RNZhN_zU8DRD" - }, - "outputs": [], - "source": [ - "def create_model():\n", - " # For this notebook, accuracy will be used to evaluate performance.\n", - " METRICS = [\n", - " tf.keras.metrics.BinaryAccuracy(name='accuracy')\n", - " ]\n", - "\n", - " # The model consists of:\n", - " # 1. An input layer that represents the 28x28x3 image flatten.\n", - " # 2. A fully connected layer with 64 units activated by a ReLU function.\n", - " # 3. A single-unit readout layer to output real-scores instead of probabilities.\n", - " model = keras.Sequential([\n", - " keras.layers.Flatten(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), name='image'),\n", - " keras.layers.Dense(64, activation='relu'),\n", - " keras.layers.Dense(1, activation=None)\n", - " ])\n", - "\n", - " # TFCO by default uses hinge loss — and that will also be used in the model.\n", - " model.compile(\n", - " optimizer=tf.keras.optimizers.Adam(0.001),\n", - " loss='hinge',\n", - " metrics=METRICS)\n", - " return model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7A4uKPNVzPVO" - }, - "source": [ - "We also define a function to set seeds to ensure reproducible results. Note that this colab is meant as an educational tool and does not have the stability of a finely tuned production pipeline. Running without setting a seed may lead to varied results. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-IVw4EgKzqSF" - }, - "outputs": [], - "source": [ - "def set_seeds():\n", - " np.random.seed(121212)\n", - " tf.compat.v1.set_random_seed(212121)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Xrbjmmeom8pA" - }, - "source": [ - "# Fairness Indicators Helper Functions\n", - "Before training our model, we define a number of helper functions that will allow us to evaluate the model's performance via Fairness Indicators.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1EPF_k620CRN" - }, - "source": [ - "First, we create a helper function to save our model once we train it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ejHbhLW5epar" - }, - "outputs": [], - "source": [ - "def save_model(model, subdir):\n", - " base_dir = tempfile.mkdtemp(prefix='saved_models')\n", - " model_location = os.path.join(base_dir, subdir)\n", - " model.save(model_location, save_format='tf')\n", - " return model_location" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "erhKEvqByCNj" - }, - "source": [ - "Next, we define functions used to preprocess the data in order to correctly pass it through to TFMA." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D2qa8Okwj_U3" - }, - "outputs": [], - "source": [ - "#@title Data Preprocessing functions for \n", - "def tfds_filepattern_for_split(dataset_name, split):\n", - " return f\"{local_test_file_full_prefix()}*\"\n", - "\n", - "class PreprocessCelebA(object):\n", - " \"\"\"Class that deserializes, decodes and applies additional preprocessing for CelebA input.\"\"\"\n", - " def __init__(self, dataset_name):\n", - " builder = tfds.builder(dataset_name)\n", - " self.features = builder.info.features\n", - " example_specs = self.features.get_serialized_info()\n", - " self.parser = tfds.core.example_parser.ExampleParser(example_specs)\n", - "\n", - " def __call__(self, serialized_example):\n", - " # Deserialize\n", - " deserialized_example = self.parser.parse_example(serialized_example)\n", - " # Decode\n", - " decoded_example = self.features.decode_example(deserialized_example)\n", - " # Additional preprocessing\n", - " image = decoded_example[IMAGE_KEY]\n", - " label = decoded_example[ATTR_KEY][LABEL_KEY]\n", - " # Resize and scale image.\n", - " image = tf.cast(image, tf.float32)\n", - " image = tf.image.resize(image, [IMAGE_SIZE, IMAGE_SIZE])\n", - " image /= 255.0\n", - " image = tf.reshape(image, [-1])\n", - " # Cast label and group to float32.\n", - " label = tf.cast(label, tf.float32)\n", - "\n", - " group = decoded_example[ATTR_KEY][GROUP_KEY]\n", - " \n", - " output = tf.train.Example()\n", - " output.features.feature[IMAGE_KEY].float_list.value.extend(image.numpy().tolist())\n", - " output.features.feature[LABEL_KEY].float_list.value.append(label.numpy())\n", - " output.features.feature[GROUP_KEY].bytes_list.value.append(b\"Young\" if group.numpy() else b'Not Young')\n", - " return output.SerializeToString()\n", - "\n", - "def tfds_as_pcollection(beam_pipeline, dataset_name, split):\n", - " return (\n", - " beam_pipeline\n", - " | 'Read records' \u003e\u003e 
beam.io.ReadFromTFRecord(tfds_filepattern_for_split(dataset_name, split))\n", - " | 'Preprocess' \u003e\u003e beam.Map(PreprocessCelebA(dataset_name))\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fBKvxd2Tz3hK" - }, - "source": [ - "Finally, we define a function that evaluates the results in TFMA." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "30YduitftaNB" - }, - "outputs": [], - "source": [ - "def get_eval_results(model_location, eval_subdir):\n", - " base_dir = tempfile.mkdtemp(prefix='saved_eval_results')\n", - " tfma_eval_result_path = os.path.join(base_dir, eval_subdir)\n", - "\n", - " eval_config_pbtxt = \"\"\"\n", - " model_specs {\n", - " label_key: \"%s\"\n", - " }\n", - " metrics_specs {\n", - " metrics {\n", - " class_name: \"FairnessIndicators\"\n", - " config: '{ \"thresholds\": [0.22, 0.5, 0.75] }'\n", - " }\n", - " metrics {\n", - " class_name: \"ExampleCount\"\n", - " }\n", - " }\n", - " slicing_specs {}\n", - " slicing_specs { feature_keys: \"%s\" }\n", - " options {\n", - " compute_confidence_intervals { value: False }\n", - " disabled_outputs{values: \"analysis\"}\n", - " }\n", - " \"\"\" % (LABEL_KEY, GROUP_KEY)\n", - " \n", - " eval_config = text_format.Parse(eval_config_pbtxt, tfma.EvalConfig())\n", - "\n", - " eval_shared_model = tfma.default_eval_shared_model(\n", - " eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])\n", - "\n", - " schema_pbtxt = \"\"\"\n", - " tensor_representation_group {\n", - " key: \"\"\n", - " value {\n", - " tensor_representation {\n", - " key: \"%s\"\n", - " value {\n", - " dense_tensor {\n", - " column_name: \"%s\"\n", - " shape {\n", - " dim { size: 28 }\n", - " dim { size: 28 }\n", - " dim { size: 3 }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " name: \"%s\"\n", - " type: FLOAT\n", - " }\n", - " feature {\n", - " name: \"%s\"\n", - " type: FLOAT\n", - " }\n", - " feature {\n", - " 
name: \"%s\"\n", - " type: BYTES\n", - " }\n", - " \"\"\" % (IMAGE_KEY, IMAGE_KEY, IMAGE_KEY, LABEL_KEY, GROUP_KEY)\n", - " schema = text_format.Parse(schema_pbtxt, schema_pb2.Schema())\n", - " coder = tf_example_record.TFExampleBeamRecord(\n", - " physical_format='inmem', schema=schema,\n", - " raw_record_column_name=tfma.ARROW_INPUT_COLUMN)\n", - " tensor_adapter_config = tensor_adapter.TensorAdapterConfig(\n", - " arrow_schema=coder.ArrowSchema(),\n", - " tensor_representations=coder.TensorRepresentations())\n", - " # Run the fairness evaluation.\n", - " with beam.Pipeline() as pipeline:\n", - " _ = (\n", - " tfds_as_pcollection(pipeline, 'celeb_a', 'test')\n", - " | 'ExamplesToRecordBatch' \u003e\u003e coder.BeamSource()\n", - " | 'ExtractEvaluateAndWriteResults' \u003e\u003e\n", - " tfma.ExtractEvaluateAndWriteResults(\n", - " eval_config=eval_config,\n", - " eval_shared_model=eval_shared_model,\n", - " output_path=tfma_eval_result_path,\n", - " tensor_adapter_config=tensor_adapter_config)\n", - " )\n", - " return tfma.load_eval_result(output_path=tfma_eval_result_path)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "76tZ3vk-tyo9" - }, - "source": [ - "# Train \u0026 Evaluate Unconstrained Model\n", - "\n", - "With the model now defined and the input pipeline in place, we’re now ready to train our model. To cut back on the amount of execution time and memory, we will train the model by slicing the data into small batches with only a few repeated iterations.\n", - "\n", - "Note that running this notebook in TensorFlow \u003c 2.0.0 may result in a deprecation warning for `np.where`. Safely ignore this warning as TensorFlow addresses this in 2.X by using `tf.where` in place of `np.where`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3m9OOdU_8GWo" - }, - "outputs": [], - "source": [ - "BATCH_SIZE = 32\n", - "\n", - "# Set seeds to get reproducible results\n", - "set_seeds()\n", - "\n", - "model_unconstrained = create_model()\n", - "model_unconstrained.fit(celeb_a_train_data_wo_group(BATCH_SIZE), epochs=5, steps_per_epoch=1000)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nCtBH9DkvtUy" - }, - "source": [ - "Evaluating the model on the test data should result in a final accuracy score of just over 85%. Not bad for a simple model with no fine tuning." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mgsjbxpTIdZf" - }, - "outputs": [], - "source": [ - "print('Overall Results, Unconstrained')\n", - "celeb_a_test_data = celeb_a_builder.as_dataset(split='test').batch(1).map(preprocess_input_dict).map(get_image_label_and_group)\n", - "results = model_unconstrained.evaluate(celeb_a_test_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L5jslIrzwIKo" - }, - "source": [ - "However, performance evaluated across age groups may reveal some shortcomings.\n", - "\n", - "To explore this further, we evaluate the model with Fairness Indicators (via TFMA). In particular, we are interested in seeing whether there is a significant gap in performance between \"Young\" and \"Not Young\" categories when evaluated on false positive rate.\n", - "\n", - "A false positive error occurs when the model incorrectly predicts the positive class. In this context, a false positive outcome occurs when the ground truth is an image of a celebrity 'Not Smiling' and the model predicts 'Smiling'. By extension, the false positive rate, which is used in the visualization above, is a measure of accuracy for a test. While this is a relatively mundane error to make in this context, false positive errors can sometimes cause more problematic behaviors. 
For instance, a false positive error in a spam classifier could cause a user to miss an important email." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nFL91nZF1V8D" - }, - "outputs": [], - "source": [ - "model_location = save_model(model_unconstrained, 'model_export_unconstrained')\n", - "eval_results_unconstrained = get_eval_results(model_location, 'eval_results_unconstrained')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "34zHIMW0NHld" - }, - "source": [ - "As mentioned above, we are concentrating on the false positive rate. The current version of Fairness Indicators (0.1.2) selects false negative rate by default. After running the line below, deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KXMVmUMi0ydk" - }, - "outputs": [], - "source": [ - "tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_results_unconstrained)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zYVpZ-DpBsfD" - }, - "source": [ - "As the results show above, we do see a **disproportionate gap between \"Young\" and \"Not Young\" categories**.\n", - "\n", - "This is where TFCO can help by constraining the false positive rate to be within a more acceptable criterion.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZNnI_Eu70gVp" - }, - "source": [ - "# Constrained Model Set Up\n", - "As documented in [TFCO's library](https://github.com/google-research/tensorflow_constrained_optimization/blob/master/README.md), there are several helpers that will make it easier to constrain the problem:\n", - "\n", - "1. `tfco.rate_context()` – This is what will be used in constructing a constraint for each age group category.\n", - "2. 
`tfco.RateMinimizationProblem()`– The rate expression to be minimized here will be the false positive rate subject to age group. In other words, performance now will be evaluated based on the difference between the false positive rates of the age group and that of the overall dataset. For this demonstration, a false positive rate of less than or equal to 5% will be set as the constraint.\n", - "3. `tfco.ProxyLagrangianOptimizerV2()` – This is the helper that will actually solve the rate constraint problem.\n", - "\n", - "The cell below will call on these helpers to set up model training with the fairness constraint.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BTukzvfD6iWr" - }, - "outputs": [], - "source": [ - "# The batch size is needed to create the input, labels and group tensors.\n", - "# These tensors are initialized with all 0's. They will eventually be assigned\n", - "# the batch content to them. A large batch size is chosen so that there are\n", - "# enough number of \"Young\" and \"Not Young\" examples in each batch.\n", - "set_seeds()\n", - "model_constrained = create_model()\n", - "BATCH_SIZE = 32\n", - "\n", - "# Create input tensor.\n", - "input_tensor = tf.Variable(\n", - " np.zeros((BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3), dtype=\"float32\"),\n", - " name=\"input\")\n", - "\n", - "# Create labels and group tensors (assuming both labels and groups are binary).\n", - "labels_tensor = tf.Variable(\n", - " np.zeros(BATCH_SIZE, dtype=\"float32\"), name=\"labels\")\n", - "groups_tensor = tf.Variable(\n", - " np.zeros(BATCH_SIZE, dtype=\"float32\"), name=\"groups\")\n", - "\n", - "# Create a function that returns the applied 'model' to the input tensor\n", - "# and generates constrained predictions.\n", - "def predictions():\n", - " return model_constrained(input_tensor)\n", - "\n", - "# Create overall context and subsetted context.\n", - "# The subsetted context contains subset of examples where 
group attribute \u003c 1\n", - "# (i.e. the subset of \"Not Young\" celebrity images).\n", - "# \"groups_tensor \u003c 1\" is used instead of \"groups_tensor == 0\" as the former\n", - "# would be a comparison on the tensor value, while the latter would be a\n", - "# comparison on the Tensor object.\n", - "context = tfco.rate_context(predictions, labels=lambda:labels_tensor)\n", - "context_subset = context.subset(lambda:groups_tensor \u003c 1)\n", - "\n", - "# Setup list of constraints.\n", - "# In this notebook, the constraint will just be: FPR to less or equal to 5%.\n", - "constraints = [tfco.false_positive_rate(context_subset) \u003c= 0.05]\n", - "\n", - "# Setup rate minimization problem: minimize overall error rate s.t. constraints.\n", - "problem = tfco.RateMinimizationProblem(tfco.error_rate(context), constraints)\n", - "\n", - "# Create constrained optimizer and obtain train_op.\n", - "# Separate optimizers are specified for the objective and constraints\n", - "optimizer = tfco.ProxyLagrangianOptimizerV2(\n", - " optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),\n", - " constraint_optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),\n", - " num_constraints=problem.num_constraints)\n", - "\n", - "# A list of all trainable variables is also needed to use TFCO.\n", - "var_list = (model_constrained.trainable_weights + list(problem.trainable_variables) +\n", - " optimizer.trainable_variables())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "thEe8A8UYbrO" - }, - "source": [ - "The model is now set up and ready to be trained with the false positive rate constraint across age group.\n", - "\n", - "Now, because the last iteration of the constrained model may not necessarily be the best performing model in terms of the defined constraint, the TFCO library comes equipped with `tfco.find_best_candidate_index()` that can help choose the best iterate out of the ones found after each epoch. 
Think of `tfco.find_best_candidate_index()` as an added heuristic that ranks each of the outcomes based on accuracy and fairness constraint (in this case, false positive rate across age group) separately with respect to the training data. That way, it can search for a better trade-off between overall accuracy and the fairness constraint.\n", - "\n", - "The following cells will start the training with constraints while also finding the best performing model per iteration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "73doG4HL6nPS" - }, - "outputs": [], - "source": [ - "# Obtain train set batches.\n", - "\n", - "NUM_ITERATIONS = 100 # Number of training iterations.\n", - "SKIP_ITERATIONS = 10 # Print training stats once in this many iterations.\n", - "\n", - "# Create temp directory for saving snapshots of models.\n", - "temp_directory = tempfile.mktemp()\n", - "os.mkdir(temp_directory)\n", - "\n", - "# List of objective and constraints across iterations.\n", - "objective_list = []\n", - "violations_list = []\n", - "\n", - "# Training iterations.\n", - "iteration_count = 0\n", - "for (image, label, group) in celeb_a_train_data_w_group(BATCH_SIZE):\n", - " # Assign current batch to input, labels and groups tensors.\n", - " input_tensor.assign(image)\n", - " labels_tensor.assign(label)\n", - " groups_tensor.assign(group)\n", - "\n", - " # Run gradient update.\n", - " optimizer.minimize(problem, var_list=var_list)\n", - "\n", - " # Record objective and violations.\n", - " objective = problem.objective()\n", - " violations = problem.constraints()\n", - "\n", - " sys.stdout.write(\n", - " \"\\r Iteration %d: Hinge Loss = %.3f, Max. 
Constraint Violation = %.3f\"\n", - " % (iteration_count + 1, objective, max(violations)))\n", - "\n", - " # Snapshot model once in SKIP_ITERATIONS iterations.\n", - " if iteration_count % SKIP_ITERATIONS == 0:\n", - " objective_list.append(objective)\n", - " violations_list.append(violations)\n", - "\n", - " # Save snapshot of model weights.\n", - " model_constrained.save_weights(\n", - " temp_directory + \"/celeb_a_constrained_\" +\n", - " str(iteration_count / SKIP_ITERATIONS) + \".h5\")\n", - "\n", - " iteration_count += 1\n", - " if iteration_count \u003e= NUM_ITERATIONS:\n", - " break\n", - "\n", - "# Choose best model from recorded iterates and load that model.\n", - "best_index = tfco.find_best_candidate_index(\n", - " np.array(objective_list), np.array(violations_list))\n", - "\n", - "model_constrained.load_weights(\n", - " temp_directory + \"/celeb_a_constrained_\" + str(best_index) + \".0.h5\")\n", - "\n", - "# Remove temp directory.\n", - "os.system(\"rm -r \" + temp_directory)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6r-6_R_gSrsT" - }, - "source": [ - "After having applied the constraint, we evaluate the results once again using Fairness Indicators." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5G6B3OR9CUmo" - }, - "outputs": [], - "source": [ - "model_location = save_model(model_constrained, 'model_export_constrained')\n", - "eval_result_constrained = get_eval_results(model_location, 'eval_results_constrained')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sVteOnE80ATS" - }, - "source": [ - "As with the previous time we used Fairness Indicators, deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in.\n", - "\n", - "Note that to fairly compare the two versions of our model, it is important to use thresholds that set the overall false positive rate to be roughly equal. 
This ensures that we are looking at actual change as opposed to just a shift in the model equivalent to simply moving the threshold boundary. In our case, comparing the unconstrained model at 0.5 and the constrained model at 0.22 provides a fair comparison for the models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GRIjYftvuc7b" - }, - "outputs": [], - "source": [ - "eval_results_dict = {\n", - " 'constrained': eval_result_constrained,\n", - " 'unconstrained': eval_results_unconstrained,\n", - "}\n", - "tfma.addons.fairness.view.widget_view.render_fairness_indicator(multi_eval_results=eval_results_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lrT-7EBrcBvV" - }, - "source": [ - "With TFCO's ability to express a more complex requirement as a rate constraint, we helped this model achieve a more desirable outcome with little impact to the overall performance. There is, of course, still room for improvement, but at least TFCO was able to find a model that gets close to satisfying the constraint and reduces the disparity between the groups as much as possible." - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Fairness Indicators TFCO CelebA Case Study.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb b/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb deleted file mode 100644 index 2065a253..00000000 --- a/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb +++ /dev/null @@ -1,1101 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jMqk3Z8EciF8" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XbpNOB-vJVKu" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bqdaOVRxWs8v" - }, - "source": [ - "# Wiki Talk Comments Toxicity Prediction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EG_KEDkodWsT" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/tree/master/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb\"\u003e\u003cimg 
src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y6T5tlXcdW7J" - }, - "source": [ - "In this example, we consider the task of predicting whether a discussion comment posted on a Wiki talk page contains toxic content (i.e. contains content that is “rude, disrespectful or unreasonable”). We use a public \u003ca href=\"https://figshare.com/articles/Wikipedia_Talk_Labels_Toxicity/4563973\"\u003edataset\u003c/a\u003e released by the \u003ca href=\"https://conversationai.github.io/\"\u003eConversation AI\u003c/a\u003e project, which contains over 100k comments from the English Wikipedia that are annotated by crowd workers (see [paper](https://arxiv.org/pdf/1610.08914.pdf) for labeling methodology).\n", - "\n", - "One of the challenges with this dataset is that a very small proportion of the comments cover sensitive topics such as sexuality or religion. As such, training a neural network model on this dataset leads to disparate performance on the smaller sensitive topics. This can mean that innocuous statements about those topics might get incorrectly flagged as ‘toxic’ at higher rates, causing speech to be unfairly censored\n", - "\n", - "By imposing constraints during training, we can train a *fairer* model that performs more equitably across the different topic groups. \n", - "\n", - "We will use the TFCO library to optimize for our fairness goal during training." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DG_C2gsAKV7x" - }, - "source": [ - "## Installation\n", - "\n", - "Let's first install and import the relevant libraries. Note that you may have to restart your colab once after running the first cell because of outdated packages in the runtime. After doing so, there should be no further issues with imports." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0XOLn8Pyrc_s" - }, - "outputs": [], - "source": [ - "#@title pip installs\n", - "!pip install git+https://github.com/google-research/tensorflow_constrained_optimization\n", - "!pip install git+https://github.com/tensorflow/fairness-indicators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ZkQDo2xcDXU" - }, - "source": [ - "Note that depending on when you run the cell below, you may receive a warning about the default version of TensorFlow in Colab switching to TensorFlow 2.X soon. You can safely ignore that warning as this notebook was designed to be compatible with TensorFlow 1.X and 2.X." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "nd_Y6CTnWs8w" - }, - "outputs": [], - "source": [ - "#@title Import Modules\n", - "import io\n", - "import os\n", - "import shutil\n", - "import sys\n", - "import tempfile\n", - "import time\n", - "import urllib\n", - "import zipfile\n", - "\n", - "import apache_beam as beam\n", - "from IPython.display import display\n", - "from IPython.display import HTML\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import tensorflow as tf\n", - "import tensorflow.keras as keras\n", - "from tensorflow.keras import layers\n", - "from tensorflow.keras.preprocessing import sequence\n", - "from tensorflow.keras.preprocessing import text\n", - "import tensorflow_constrained_optimization as tfco\n", - "import tensorflow_model_analysis as tfma\n", - "import fairness_indicators as fi\n", - "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", - "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict as agnostic_predict" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GvqR564dLEVa" - }, - "source": [ - "Though TFCO is compatible with eager and graph execution, this notebook assumes that eager execution is enabled by default. To ensure that nothing breaks, eager execution will be enabled in the cell below." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "avMBqzjWct4Z" - }, - "outputs": [], - "source": [ - "#@title Enable Eager Execution and Print Versions\n", - "if tf.__version__ \u003c \"2.0.0\":\n", - " tf.enable_eager_execution()\n", - " print(\"Eager execution enabled.\")\n", - "else:\n", - " print(\"Eager execution enabled by default.\")\n", - "\n", - "print(\"TensorFlow \" + tf.__version__)\n", - "print(\"TFMA \" + tfma.__version__)\n", - "print(\"FI \" + fi.version.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YUJyWaAwWs83" - }, - "source": [ - "## Hyper-parameters\n", - "\n", - "First, we set some hyper-parameters needed for the data preprocessing and model training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1aXlwlqTWs84" - }, - "outputs": [], - "source": [ - "hparams = {\n", - " \"batch_size\": 128,\n", - " \"cnn_filter_sizes\": [128, 128, 128],\n", - " \"cnn_kernel_sizes\": [5, 5, 5],\n", - " \"cnn_pooling_sizes\": [5, 5, 40],\n", - " \"constraint_learning_rate\": 0.01,\n", - " \"embedding_dim\": 100,\n", - " \"embedding_trainable\": False,\n", - " \"learning_rate\": 0.005,\n", - " \"max_num_words\": 10000,\n", - " \"max_sequence_length\": 250\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0PMs8Iwxq98C" - }, - "source": [ - "## Load and pre-process dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DIe2JRDeWs87" - }, - "source": [ - "Next, we download the dataset and preprocess it. The train, test and validation sets are provided as separate CSV files." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rcd2CV7pWs88" - }, - "outputs": [], - "source": [ - "toxicity_data_url = (\"https://github.com/conversationai/unintended-ml-bias-analysis/\"\n", - " \"raw/e02b9f12b63a39235e57ba6d3d62d8139ca5572c/data/\")\n", - "\n", - "data_train = pd.read_csv(toxicity_data_url + \"wiki_train.csv\")\n", - "data_test = pd.read_csv(toxicity_data_url + \"wiki_test.csv\")\n", - "data_vali = pd.read_csv(toxicity_data_url + \"wiki_dev.csv\")\n", - "\n", - "data_train.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ojo617RIWs8_" - }, - "source": [ - "The `comment` column contains the discussion comments and `is_toxic` column indicates whether or not a comment is annotated as toxic. \n", - "\n", - "In the following, we:\n", - "1. Separate out the labels\n", - "2. Tokenize the text comments\n", - "3. Identify comments that contain sensitive topic terms \n", - "\n", - "First, we separate the labels from the train, test and validation sets. The labels are all binary (0 or 1)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mxo7ny90Ws9A" - }, - "outputs": [], - "source": [ - "labels_train = data_train[\"is_toxic\"].values.reshape(-1, 1) * 1.0\n", - "labels_test = data_test[\"is_toxic\"].values.reshape(-1, 1) * 1.0\n", - "labels_vali = data_vali[\"is_toxic\"].values.reshape(-1, 1) * 1.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "alrWi6jUWs9C" - }, - "source": [ - "Next, we tokenize the textual comments using the `Tokenizer` provided by `Keras`. We use the training set comments alone to build a vocabulary of tokens, and use them to convert all the comments into a (padded) sequence of tokens of the same length." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yvOTBsrHWs9D" - }, - "outputs": [], - "source": [ - "tokenizer = text.Tokenizer(num_words=hparams[\"max_num_words\"])\n", - "tokenizer.fit_on_texts(data_train[\"comment\"])\n", - "\n", - "def prep_text(texts, tokenizer, max_sequence_length):\n", - " # Turns text into into padded sequences.\n", - " text_sequences = tokenizer.texts_to_sequences(texts)\n", - " return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)\n", - "\n", - "text_train = prep_text(data_train[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])\n", - "text_test = prep_text(data_test[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])\n", - "text_vali = prep_text(data_vali[\"comment\"], tokenizer, hparams[\"max_sequence_length\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cn5zbgp-Ws9F" - }, - "source": [ - "Finally, we identify comments related to certain sensitive topic groups. We consider a subset of the \u003ca href=\"https://github.com/conversationai/unintended-ml-bias-analysis/blob/master/unintended_ml_bias/bias_madlibs_data/adjectives_people.txt\"\u003eidentity terms\u003c/a\u003e provided with the dataset and group them into\n", - "four broad topic groups: *sexuality*, *gender identity*, *religion*, and *race*." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EnFfV2gEWs9G" - }, - "outputs": [], - "source": [ - "terms = {\n", - " 'sexuality': ['gay', 'lesbian', 'bisexual', 'homosexual', 'straight', 'heterosexual'], \n", - " 'gender identity': ['trans', 'transgender', 'cis', 'nonbinary'],\n", - " 'religion': ['christian', 'muslim', 'jewish', 'buddhist', 'catholic', 'protestant', 'sikh', 'taoist'],\n", - " 'race': ['african', 'african american', 'black', 'white', 'european', 'hispanic', 'latino', 'latina', \n", - " 'latinx', 'mexican', 'canadian', 'american', 'asian', 'indian', 'middle eastern', 'chinese', \n", - " 'japanese']}\n", - "\n", - "group_names = list(terms.keys())\n", - "num_groups = len(group_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ooI3F5M4Ws9I" - }, - "source": [ - "We then create separate group membership matrices for the train, test and validation sets, where the rows correspond to comments, the columns correspond to the four sensitive groups, and each entry is a boolean indicating whether the comment contains a term from the topic group." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zO7PyNckWs9J" - }, - "outputs": [], - "source": [ - "def get_groups(text):\n", - " # Returns a boolean NumPy array of shape (n, k), where n is the number of comments, \n", - " # and k is the number of groups. 
Each entry (i, j) indicates if the i-th comment \n", - " # contains a term from the j-th group.\n", - " groups = np.zeros((text.shape[0], num_groups))\n", - " for ii in range(num_groups):\n", - " groups[:, ii] = text.str.contains('|'.join(terms[group_names[ii]]), case=False)\n", - " return groups\n", - "\n", - "groups_train = get_groups(data_train[\"comment\"])\n", - "groups_test = get_groups(data_test[\"comment\"])\n", - "groups_vali = get_groups(data_vali[\"comment\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GFAI6AB9Ws9L" - }, - "source": [ - "As shown below, all four topic groups constitute only a small fraction of the overall dataset, and have varying proportions of toxic comments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8Ug4u_P9Ws9M" - }, - "outputs": [], - "source": [ - "print(\"Overall label proportion = %.1f%%\" % (labels_train.mean() * 100))\n", - "\n", - "group_stats = []\n", - "for ii in range(num_groups):\n", - " group_proportion = groups_train[:, ii].mean()\n", - " group_pos_proportion = labels_train[groups_train[:, ii] == 1].mean()\n", - " group_stats.append([group_names[ii],\n", - " \"%.2f%%\" % (group_proportion * 100), \n", - " \"%.1f%%\" % (group_pos_proportion * 100)])\n", - "group_stats = pd.DataFrame(group_stats, \n", - " columns=[\"Topic group\", \"Group proportion\", \"Label proportion\"])\n", - "group_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aG5ZKKrVWs9O" - }, - "source": [ - "We see that only 1.3% of the dataset contains comments related to sexuality. Among them, 37% of the comments have been annotated as being toxic. Note that this is significantly larger than the overall proportion of comments annotated as toxic. This could be because the few comments that used those identity terms did so in pejorative contexts. 
As mentioned above, this could cause our model to disporportionately misclassify comments as toxic when they include those terms. Since this is the concern, we'll make sure to look at the **False Positive Rate** when we evaluate the model's performance." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5DkJpKaLWs9P" - }, - "source": [ - "## Build CNN toxicity prediction model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "niJ4KIJgWs9Q" - }, - "source": [ - "Having prepared the dataset, we now build a `Keras` model for prediction toxicity. The model we use is a convolutional neural network (CNN) with the same architecture used by the Conversation AI project for their debiasing analysis. We adapt \u003ca href=\"https://github.com/conversationai/unintended-ml-bias-analysis/blob/master/unintended_ml_bias/model_tool.py\"\u003ecode\u003c/a\u003e provided by them to construct the model layers.\n", - "\n", - "The model uses an embedding layer to convert the text tokens to fixed-length vectors. This layer converts the input text sequence into a sequence of vectors, and passes them through several layers of convolution and pooling operations, followed by a final fully-connected layer.\n", - "\n", - "We make use of pre-trained GloVe word vector embeddings, which we download below. This may take a few minutes to complete." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yevbBL2oWs9Q" - }, - "outputs": [], - "source": [ - "zip_file_url = \"http://nlp.stanford.edu/data/glove.6B.zip\"\n", - "zip_file = urllib.request.urlopen(zip_file_url)\n", - "archive = zipfile.ZipFile(io.BytesIO(zip_file.read()))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a11-YWDnWs9S" - }, - "source": [ - "We use the downloaded GloVe embeddings to create an embedding matrix, where the rows contain the word embeddings for the tokens in the `Tokenizer`'s vocabulary. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bBS74MMYWs9T" - }, - "outputs": [], - "source": [ - "embeddings_index = {}\n", - "glove_file = \"glove.6B.100d.txt\"\n", - "\n", - "with archive.open(glove_file) as f:\n", - " for line in f:\n", - " values = line.split()\n", - " word = values[0].decode(\"utf-8\") \n", - " coefs = np.asarray(values[1:], dtype=\"float32\")\n", - " embeddings_index[word] = coefs\n", - "\n", - "embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, hparams[\"embedding_dim\"]))\n", - "num_words_in_embedding = 0\n", - "for word, i in tokenizer.word_index.items():\n", - " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None:\n", - " num_words_in_embedding += 1\n", - " embedding_matrix[i] = embedding_vector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "t9NVp-_eWs9V" - }, - "source": [ - "We are now ready to specify the `Keras` layers. We write a function to create a new model, which we will invoke whenever we wish to train a new model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_f_DhA6OWs9W" - }, - "outputs": [], - "source": [ - "def create_model():\n", - " model = keras.Sequential()\n", - "\n", - " # Embedding layer.\n", - " embedding_layer = layers.Embedding(\n", - " embedding_matrix.shape[0],\n", - " embedding_matrix.shape[1],\n", - " weights=[embedding_matrix],\n", - " input_length=hparams[\"max_sequence_length\"],\n", - " trainable=hparams['embedding_trainable'])\n", - " model.add(embedding_layer)\n", - "\n", - " # Convolution layers.\n", - " for filter_size, kernel_size, pool_size in zip(\n", - " hparams['cnn_filter_sizes'], hparams['cnn_kernel_sizes'],\n", - " hparams['cnn_pooling_sizes']):\n", - "\n", - " conv_layer = layers.Conv1D(\n", - " filter_size, kernel_size, activation='relu', padding='same')\n", - " model.add(conv_layer)\n", - "\n", - " pooled_layer = layers.MaxPooling1D(pool_size, padding='same')\n", - " model.add(pooled_layer)\n", - "\n", - " # Add a flatten layer, a fully-connected layer and an output layer.\n", - " model.add(layers.Flatten())\n", - " model.add(layers.Dense(128, activation='relu'))\n", - " model.add(layers.Dense(1))\n", - " \n", - " return model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CwcqYITBN7bW" - }, - "source": [ - "We also define a method to set random seeds. This is done to ensure reproducible results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C_1nsXntN98C" - }, - "outputs": [], - "source": [ - "def set_seeds():\n", - " np.random.seed(121212)\n", - " tf.compat.v1.set_random_seed(212121)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X-_fKjDtWs9Y" - }, - "source": [ - "## Fairness indicators" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k009haGaWs9Z" - }, - "source": [ - "We also write functions to plot fairness indicators." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B9ZgGCAs8V-I" - }, - "outputs": [], - "source": [ - "def create_examples(labels, predictions, groups, group_names):\n", - " # Returns tf.examples with given labels, predictions, and group information. \n", - " examples = []\n", - " sigmoid = lambda x: 1/(1 + np.exp(-x)) \n", - " for ii in range(labels.shape[0]):\n", - " example = tf.train.Example()\n", - " example.features.feature['toxicity'].float_list.value.append(\n", - " labels[ii][0])\n", - " example.features.feature['prediction'].float_list.value.append(\n", - " sigmoid(predictions[ii][0])) # predictions need to be in [0, 1].\n", - " for jj in range(groups.shape[1]):\n", - " example.features.feature[group_names[jj]].bytes_list.value.append(\n", - " b'Yes' if groups[ii, jj] else b'No')\n", - " examples.append(example)\n", - " return examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vESL-3dU9iiG" - }, - "outputs": [], - "source": [ - "def evaluate_results(labels, predictions, groups, group_names):\n", - " # Evaluates fairness indicators for given labels, predictions and group\n", - " # membership info.\n", - " examples = create_examples(labels, predictions, groups, group_names)\n", - "\n", - " # Create feature map for labels, predictions and each group.\n", - " feature_map = {\n", - " 'prediction': tf.io.FixedLenFeature([], tf.float32),\n", - " 'toxicity': tf.io.FixedLenFeature([], tf.float32),\n", - " }\n", - " for group in group_names:\n", - " feature_map[group] = tf.io.FixedLenFeature([], tf.string)\n", - "\n", - " # Serialize the examples.\n", - " serialized_examples = [e.SerializeToString() for e in examples]\n", - "\n", - " BASE_DIR = tempfile.gettempdir()\n", - " OUTPUT_DIR = os.path.join(BASE_DIR, 'output')\n", - "\n", - " with beam.Pipeline() as pipeline:\n", - " model_agnostic_config = agnostic_predict.ModelAgnosticConfig(\n", - " label_keys=['toxicity'],\n", - " 
prediction_keys=['prediction'],\n", - " feature_spec=feature_map)\n", - " \n", - " slices = [tfma.slicer.SingleSliceSpec()]\n", - " for group in group_names:\n", - " slices.append(\n", - " tfma.slicer.SingleSliceSpec(columns=[group]))\n", - "\n", - " extractors = [\n", - " model_agnostic_extractor.ModelAgnosticExtractor(\n", - " model_agnostic_config=model_agnostic_config),\n", - " tfma.extractors.slice_key_extractor.SliceKeyExtractor(slices)\n", - " ]\n", - "\n", - " metrics_callbacks = [\n", - " tfma.post_export_metrics.fairness_indicators(\n", - " thresholds=[0.5],\n", - " target_prediction_keys=['prediction'],\n", - " labels_key='toxicity'),\n", - " tfma.post_export_metrics.example_count()]\n", - "\n", - " # Create a model agnostic aggregator.\n", - " eval_shared_model = tfma.types.EvalSharedModel(\n", - " add_metrics_callbacks=metrics_callbacks,\n", - " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", - " add_metrics_callbacks=metrics_callbacks,\n", - " config=model_agnostic_config))\n", - "\n", - " # Run Model Agnostic Eval.\n", - " _ = (\n", - " pipeline\n", - " | beam.Create(serialized_examples)\n", - " | 'ExtractEvaluateAndWriteResults' \u003e\u003e\n", - " tfma.ExtractEvaluateAndWriteResults(\n", - " eval_shared_model=eval_shared_model,\n", - " output_path=OUTPUT_DIR,\n", - " extractors=extractors,\n", - " compute_confidence_intervals=True\n", - " )\n", - " )\n", - "\n", - " fairness_ind_result = tfma.load_eval_result(output_path=OUTPUT_DIR)\n", - "\n", - " # Also evaluate accuracy of the model.\n", - " accuracy = np.mean(labels == (predictions \u003e 0.0))\n", - "\n", - " return fairness_ind_result, accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "W3Sp7mpsWs9f" - }, - "outputs": [], - "source": [ - "def plot_fairness_indicators(eval_result, title):\n", - " fairness_ind_result, accuracy = eval_result\n", - " display(HTML(\"\u003ccenter\u003e\u003ch2\u003e\" + title + \n", - " \" (Accuracy = 
%.2f%%)\" % (accuracy * 100) + \"\u003c/h2\u003e\u003c/center\u003e\"))\n", - " widget_view.render_fairness_indicator(fairness_ind_result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WqLdtgI42fxb" - }, - "outputs": [], - "source": [ - "def plot_multi_fairness_indicators(multi_eval_results):\n", - " \n", - " multi_results = {}\n", - " multi_accuracy = {}\n", - " for title, (fairness_ind_result, accuracy) in multi_eval_results.items():\n", - " multi_results[title] = fairness_ind_result\n", - " multi_accuracy[title] = accuracy\n", - " \n", - " title_str = \"\u003ccenter\u003e\u003ch2\u003e\"\n", - " for title in multi_eval_results.keys():\n", - " title_str+=title + \" (Accuracy = %.2f%%)\" % (multi_accuracy[title] * 100) + \"; \"\n", - " title_str=title_str[:-2]\n", - " title_str+=\"\u003c/h2\u003e\u003c/center\u003e\"\n", - " # fairness_ind_result, accuracy = eval_result\n", - " display(HTML(title_str))\n", - " widget_view.render_fairness_indicator(multi_eval_results=multi_results)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8aWNc4CdWs9h" - }, - "source": [ - "## Train unconstrained model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DuSA8qL7Ws9i" - }, - "source": [ - "For the first model we train, we optimize a simple cross-entropy loss *without* any constraints.." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0g50bauHWs9j" - }, - "outputs": [], - "source": [ - "# Set random seed for reproducible results.\n", - "set_seeds()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YsCoHMG_iIzc" - }, - "source": [ - "**Note**: The following code cell can take ~8 minutes to run." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tamJiG3FiDYW" - }, - "outputs": [], - "source": [ - "# Optimizer and loss.\n", - "optimizer = tf.keras.optimizers.Adam(learning_rate=hparams[\"learning_rate\"])\n", - "loss = lambda y_true, y_pred: tf.keras.losses.binary_crossentropy(\n", - " y_true, y_pred, from_logits=True)\n", - "\n", - "# Create, compile and fit model.\n", - "model_unconstrained = create_model()\n", - "model_unconstrained.compile(optimizer=optimizer, loss=loss)\n", - "\n", - "model_unconstrained.fit(\n", - " x=text_train, y=labels_train, batch_size=hparams[\"batch_size\"], epochs=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p7AvIdktWs9t" - }, - "source": [ - "Having trained the unconstrained model, we plot various evaluation metrics for the model on the test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tHV40_21lRL6" - }, - "outputs": [], - "source": [ - "scores_unconstrained_test = model_unconstrained.predict(text_test)\n", - "eval_result_unconstrained = evaluate_results(\n", - " labels_test, scores_unconstrained_test, groups_test, group_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AJpRuN0EOeyG" - }, - "source": [ - "As explained above, we are concentrating on the false positive rate. In their current version (0.1.2), Fairness Indicators select false negative rate by default. After running the line below, go ahead and deselect false_negative_rate and select false_positive_rate to look at the metric we are interested in." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2fwNpfou4yvP" - }, - "outputs": [], - "source": [ - "plot_fairness_indicators(eval_result_unconstrained, \"Unconstrained\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J3TbAenkGM7P" - }, - "source": [ - "While the overall false positive rate is less than 2%, the false positive rate on the sexuality-related comments is significantly higher. This is because the sexuality group is very small in size, and has a disproportionately higher fraction of comments annotated as toxic. Hence, training a model without constraints results in the model believing that sexuality-related terms are a strong indicator of toxicity." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KmxyAo9hWs9w" - }, - "source": [ - "## Train with constraints on false positive rates" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l3dYUchIWs9w" - }, - "source": [ - "To avoid large differences in false positive rates across different groups, we \n", - "next train a model by constraining the false positive rates for each group to be within a desired limit. In this case, we will optimize the error rate of the model subject to the *per-group false positive rates being lesser or equal to 2%*.\n", - "\n", - "Training on minibatches with per-group constraints can be challenging for this dataset, however, as the groups we wish to constraint are all small in size, and it's likely that the individual minibatches contain very few examples from each group. Hence the gradients we compute during training will be noisy, and result in the model converging very slowly. \n", - "\n", - "To mitigate this problem, we recommend using two streams of minibatches, with the first stream formed as before from the entire training set, and the second stream formed solely from the sensitive group examples. 
We will compute the objective using minibatches from the first stream and the per-group constraints using minibatches from the second stream. Because the batches from the second stream are likely to contain a larger number of examples from each group, we expect our updates to be less noisy.\n", - "\n", - "We create separate features, labels and groups tensors to hold the minibatches from the two streams." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vMuuTOEOWs9x" - }, - "outputs": [], - "source": [ - "# Set random seed.\n", - "set_seeds()\n", - "\n", - "# Features tensors.\n", - "batch_shape = (hparams[\"batch_size\"], hparams['max_sequence_length'])\n", - "features_tensor = tf.Variable(np.zeros(batch_shape, dtype='int32'), name='x')\n", - "features_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='int32'), name='x_sen')\n", - "\n", - "# Labels tensors.\n", - "batch_shape = (hparams[\"batch_size\"], 1)\n", - "labels_tensor = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='labels')\n", - "labels_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='labels_sen')\n", - "\n", - "# Groups tensors.\n", - "batch_shape = (hparams[\"batch_size\"], num_groups)\n", - "groups_tensor_sen = tf.Variable(np.zeros(batch_shape, dtype='float32'), name='groups_sen')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-wh26V7nWs9z" - }, - "source": [ - "We instantiate a new model, and compute predictions for minibatches from the two streams." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kawyrkQIWs9z" - }, - "outputs": [], - "source": [ - "# Create model, and separate prediction functions for the two streams. 
\n", - "# For the predictions, we use a nullary function returning a Tensor to support eager mode.\n", - "model_constrained = create_model()\n", - "\n", - "def predictions():\n", - " return model_constrained(features_tensor)\n", - "\n", - "def predictions_sen():\n", - " return model_constrained(features_tensor_sen)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UG9t7dw1Ws91" - }, - "source": [ - "We then set up a constrained optimization problem with the error rate as the objective and with constraints on the per-group false positive rate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EhKAMGSJWs93" - }, - "outputs": [], - "source": [ - "epsilon = 0.02 # Desired false-positive rate threshold.\n", - "\n", - "# Set up separate contexts for the two minibatch streams.\n", - "context = tfco.rate_context(predictions, lambda:labels_tensor)\n", - "context_sen = tfco.rate_context(predictions_sen, lambda:labels_tensor_sen)\n", - "\n", - "# Compute the objective using the first stream.\n", - "objective = tfco.error_rate(context)\n", - "\n", - "# Compute the constraint using the second stream.\n", - "# Subset the examples belonging to the \"sexuality\" group from the second stream \n", - "# and add a constraint on the group's false positive rate.\n", - "context_sen_subset = context_sen.subset(lambda: groups_tensor_sen[:, 0] \u003e 0)\n", - "constraint = [tfco.false_positive_rate(context_sen_subset) \u003c= epsilon]\n", - "\n", - "# Create a rate minimization problem.\n", - "problem = tfco.RateMinimizationProblem(objective, constraint)\n", - "\n", - "# Set up a constrained optimizer.\n", - "optimizer = tfco.ProxyLagrangianOptimizerV2(\n", - " optimizer=tf.keras.optimizers.Adam(learning_rate=hparams[\"learning_rate\"]),\n", - " num_constraints=problem.num_constraints)\n", - "\n", - "# List of variables to optimize include the model weights, \n", - "# and the trainable variables from the rate minimization problem and 
\n", - "# the constrained optimizer.\n", - "var_list = (model_constrained.trainable_weights + list(problem.trainable_variables) +\n", - " optimizer.trainable_variables())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CoFWd8wMWs94" - }, - "source": [ - "We are ready to train the model. We maintain a separate counter for the two minibatch streams. Every time we perform a gradient update, we will have to copy the minibatch contents from the first stream to the tensors `features_tensor` and `labels_tensor`, and the minibatch contents from the second stream to the tensors `features_tensor_sen`, `labels_tensor_sen` and `groups_tensor_sen`.\n", - "\n", - "**Note**: The following code cell may take ~12 minutes to run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zbXohC6vWs95" - }, - "outputs": [], - "source": [ - "# Indices of sensitive group members.\n", - "protected_group_indices = np.nonzero(groups_train.sum(axis=1))[0]\n", - "\n", - "num_examples = text_train.shape[0]\n", - "num_examples_sen = protected_group_indices.shape[0]\n", - "batch_size = hparams[\"batch_size\"]\n", - "\n", - "# Number of steps needed for one epoch over the training sample.\n", - "num_steps = int(num_examples / batch_size)\n", - "\n", - "start_time = time.time()\n", - "\n", - "# Loop over minibatches.\n", - "for batch_index in range(num_steps):\n", - " # Indices for current minibatch in the first stream.\n", - " batch_indices = np.arange(\n", - " batch_index * batch_size, (batch_index + 1) * batch_size)\n", - " batch_indices = [ind % num_examples for ind in batch_indices]\n", - "\n", - " # Indices for current minibatch in the second stream.\n", - " batch_indices_sen = np.arange(\n", - " batch_index * batch_size, (batch_index + 1) * batch_size)\n", - " batch_indices_sen = [protected_group_indices[ind % num_examples_sen]\n", - " for ind in batch_indices_sen]\n", - "\n", - " # Assign features, labels, groups from the minibatches to the 
respective tensors.\n", - " features_tensor.assign(text_train[batch_indices, :])\n", - " labels_tensor.assign(labels_train[batch_indices])\n", - "\n", - " features_tensor_sen.assign(text_train[batch_indices_sen, :])\n", - " labels_tensor_sen.assign(labels_train[batch_indices_sen])\n", - " groups_tensor_sen.assign(groups_train[batch_indices_sen, :])\n", - "\n", - " # Gradient update.\n", - " optimizer.minimize(problem, var_list=var_list)\n", - " \n", - " # Record and print batch training stats every 10 steps.\n", - " if (batch_index + 1) % 10 == 0 or batch_index in (0, num_steps - 1):\n", - " hinge_loss = problem.objective()\n", - " max_violation = max(problem.constraints())\n", - "\n", - " elapsed_time = time.time() - start_time\n", - " sys.stdout.write(\n", - " \"\\rStep %d / %d: Elapsed time = %ds, Loss = %.3f, Violation = %.3f\" % \n", - " (batch_index + 1, num_steps, elapsed_time, hinge_loss, max_violation))\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DdJfplDpWs97" - }, - "source": [ - "Having trained the constrained model, we plot various evaluation metrics for the model on the test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jEerPEwLhfTN" - }, - "outputs": [], - "source": [ - "scores_constrained_test = model_constrained.predict(text_test)\n", - "eval_result_constrained = evaluate_results(\n", - " labels_test, scores_constrained_test, groups_test, group_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ustp5z7xQnHI" - }, - "source": [ - "As with last time, remember to select false_positive_rate." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ztK7iM4LjKmT" - }, - "outputs": [], - "source": [ - "plot_fairness_indicators(eval_result_constrained, \"Constrained\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6P6dxSg5_mTu" - }, - "outputs": [], - "source": [ - "multi_results = {\n", - " 'constrained':eval_result_constrained,\n", - " 'unconstrained':eval_result_unconstrained,\n", - "}\n", - "plot_multi_fairness_indicators(multi_eval_results=multi_results)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EfKo5O3QWs9-" - }, - "source": [ - "As we can see from the Fairness Indicators, compared to the unconstrained model the constrained model yields significantly lower false positive rates for the sexuality-related comments, and does so with only a slight dip in the overall accuracy." - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Fairness Indicators TFCO Wiki Comments Case Study.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb b/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb deleted file mode 100644 index 40ad80f0..00000000 --- a/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb +++ /dev/null @@ -1,380 +0,0 @@ -{ - "cells": [ - { - "metadata": { - "id": "_E4uORykIpG4" - }, - "cell_type": "markdown", - "source": [ - "##### Copyright 2020 The TensorFlow Authors." 
- ] - }, - { - "metadata": { - "cellView": "form", - "id": "aBT221yVIujn" - }, - "cell_type": "code", - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "aalPefrUUplk" - }, - "cell_type": "markdown", - "source": [ - "# Fairness Indicators TensorBoard Plugin Example Colab" - ] - }, - { - "metadata": { - "id": "fFTJpyFlI-uI" - }, - "cell_type": "markdown", - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_indicators_TensorBoard_Plugin_Example_Colab\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" 
href=\"https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "metadata": { - "id": "UZ48WFLwbCL6" - }, - "cell_type": "markdown", - "source": [ - "##Overview\n", - "\n", - "In this activity, you'll use [Fairness Indicators for TensorBoard](https://github.com/tensorflow/tensorboard/tree/master/docs/fairness-indicators.md). With the plugin, you can visualize fairness evaluations for your runs and easily compare performance across groups.\n" - ] - }, - { - "metadata": { - "id": "u33JXdluZ2lG" - }, - "cell_type": "markdown", - "source": [ - "# Importing\n", - "\n", - "Run the following code to install the required libraries." - ] - }, - { - "metadata": { - "id": "EoRNffG599XP" - }, - "cell_type": "code", - "source": [ - "!pip install -q -U pip==20.2\n", - "\n", - "!pip install fairness_indicators 'absl-py\u003c0.9,\u003e=0.7'\n", - "!pip install google-api-python-client==1.8.3\n", - "!pip install tensorboard-plugin-fairness-indicators\n", - "!pip install tensorflow-serving-api==2.17.1\n", - "!pip install tensorflow-model-analysis" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "mglfaM4_mtIk" - }, - "cell_type": "markdown", - "source": [ - "**Restart the runtime.** After the runtime is restarted, continue with following cells without running previous cell again." 
- ] - }, - { - "metadata": { - "id": "sFZJ8f_M7mlc" - }, - "cell_type": "code", - "source": [ - "# %tf.disable_v2_behavior()\t# Uncomment this line if running in Google Colab." - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "B8dlyTyiTe-9" - }, - "cell_type": "code", - "source": [ - "import datetime\n", - "import os\n", - "import tempfile\n", - "from tensorboard_plugin_fairness_indicators import summary_v2\n", - "import tensorflow.compat.v1 as tf\n", - "import numpy as np\n", - "from tensorflow import keras\n", - "from google.protobuf import text_format\n", - "\n", - "# example_model.py is provided in fairness_indicators package to train and\n", - "# evaluate an example model.\n", - "from fairness_indicators import example_model\n", - "import tensorflow_model_analysis as tfma\n", - "\n", - "tf.compat.v1.enable_eager_execution()" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "TsplOJGqWCf5" - }, - "cell_type": "markdown", - "source": [ - "# Data and Constants" - ] - }, - { - "metadata": { - "id": "NdLBi6tN5i7I" - }, - "cell_type": "code", - "source": [ - "# To know about dataset, check Fairness Indicators Example Colab at:\n", - "# https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Example_Colab.ipynb\n", - "\n", - "train_tf_file = tf.keras.utils.get_file('train.tf', 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", - "validate_tf_file = tf.keras.utils.get_file('validate.tf', 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')\n", - "\n", - "BASE_DIR = tempfile.gettempdir()\n", - "TEXT_FEATURE = 'comment_text'\n", - "LABEL = 'toxicity'\n", - "FEATURE_MAP = {\n", - " # Label:\n", - " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", - " # Text:\n", - " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", - "\n", - " # Identities:\n", - " 'sexual_orientation': 
tf.io.VarLenFeature(tf.string),\n", - " 'gender': tf.io.VarLenFeature(tf.string),\n", - " 'religion': tf.io.VarLenFeature(tf.string),\n", - " 'race': tf.io.VarLenFeature(tf.string),\n", - " 'disability': tf.io.VarLenFeature(tf.string),\n", - "}" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "mfbgerCsEOmN" - }, - "cell_type": "markdown", - "source": [ - "# Train the Model" - ] - }, - { - "metadata": { - "id": "YwoC-dzEDid3" - }, - "cell_type": "code", - "source": [ - "model_dir = os.path.join(BASE_DIR, 'train',\n", - " datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "VqjEYySbYaX5" - }, - "cell_type": "code", - "source": [ - "classifier = example_model.get_example_model(example_model.TEXT_FEATURE)\n", - "classifier.compile(optimizer=keras.optimizers.Adam(), loss='mse')\n", - "\n", - "# Read the data from the training file\n", - "data = []\n", - "dataset = tf.data.Dataset.list_files(train_tf_file, shuffle=False)\n", - "dataset = dataset.flat_map(tf.data.TFRecordDataset)\n", - "for raw_record in dataset.take(1):\n", - " example = tf.train.Example()\n", - " example.ParseFromString(raw_record.numpy())\n", - " data.append(example)\n", - "\n", - "classifier.fit(\n", - " tf.constant([e.SerializeToString() for e in data]),\n", - " np.array([\n", - " e.features.feature[example_model.LABEL].float_list.value[:][0]\n", - " for e in data\n", - " ]),\n", - ")\n", - "classifier.save(model_dir, save_format='tf')" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "jTPqije9Eg5b" - }, - "cell_type": "markdown", - "source": [ - "# Run TensorFlow Model Analysis with Fairness Indicators\n", - "This step might take 2 to 5 minutes." 
- ] - }, - { - "metadata": { - "id": "QLjiy5VCzlRw" - }, - "cell_type": "code", - "source": [ - "tfma_eval_result_path = os.path.join(BASE_DIR, 'tfma_eval_result')\n", - "\n", - "eval_config = text_format.Parse(\n", - " \"\"\"\n", - " model_specs {\n", - " signature_name: \"serving_default\"\n", - " prediction_key: \"predictions\" # placeholder\n", - " label_key: \"toxicity\" # placeholder\n", - " }\n", - " slicing_specs {}\n", - " slicing_specs {\n", - " feature_keys: [\"gender\"]\n", - " }\n", - " metrics_specs {\n", - " metrics {\n", - " class_name: \"ExampleCount\"\n", - " }\n", - " metrics {\n", - " class_name: \"FairnessIndicators\"\n", - " }\n", - " }\n", - "\"\"\",\n", - " tfma.EvalConfig(),\n", - ")\n", - "\n", - "tfma_eval_result_path = os.path.join(model_dir, 'tfma_eval_result')\n", - "example_model.evaluate_model(\n", - " model_dir,\n", - " validate_tf_file,\n", - " tfma_eval_result_path,\n", - " eval_config,\n", - ")" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "U1ROnulYc8Ub" - }, - "cell_type": "markdown", - "source": [ - "# Visualize Fairness Indicators in TensorBoard\n", - "\n", - "\n", - "Below you will visualize Fairness Indicators in Tensorboard and compare performance of each slice of the data on selected metrics. You can adjust the baseline comparison slice as well as the displayed threshold(s) using the drop down menus at the top of the visualization. You can also select different evaluation runs using the drop down menu at the top-left corner." - ] - }, - { - "metadata": { - "id": "zCV-Jo0xda6g" - }, - "cell_type": "markdown", - "source": [ - "## Write Fairness Indicators Summary\n", - "Write summary file containing all required information to visualize Fairness Indicators in TensorBoard." 
- ] - }, - { - "metadata": { - "id": "JNaNhTCTAMHm" - }, - "cell_type": "code", - "source": [ - "import tensorflow.compat.v2 as tf2\n", - "\n", - "writer = tf2.summary.create_file_writer(\n", - " os.path.join(model_dir, 'fairness_indicators'))\n", - "with writer.as_default():\n", - " summary_v2.FairnessIndicators(tfma_eval_result_path, step=1)\n", - "writer.close()" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "MB2Gfm9BdXVY" - }, - "cell_type": "markdown", - "source": [ - "## Launch TensorBoard\n", - "Navigate to \"Fairness Indicators\" tab to visualize Fairness Indicators." - ] - }, - { - "metadata": { - "id": "UiHhDWu8tyEI" - }, - "cell_type": "code", - "source": [ - - "%load_ext tensorboard" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "ix6d718udWsK" - }, - "cell_type": "code", - "source": [ - - "%tensorboard --logdir=$model_dir" - ], - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb b/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb deleted file mode 100644 index 7e418336..00000000 --- a/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb +++ /dev/null @@ -1,511 +0,0 @@ -{ - "cells": [ - { - "metadata": { - "id": "Tce3stUlHN0L" - }, - "cell_type": "markdown", - "source": [ - "##### Copyright 2020 The TensorFlow Authors." 
- ] - }, - { - "metadata": { - "cellView": "form", - "id": "tuOe1ymfHZPu" - }, - "cell_type": "code", - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "aalPefrUUplk" - }, - "cell_type": "markdown", - "source": [ - "# Fairness Indicators on TF-Hub Text Embeddings" - ] - }, - { - "metadata": { - "id": "MfBg1C5NB3X0" - }, - "cell_type": "markdown", - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb\"\u003e\u003cimg 
src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/fairness-indicators/g3doc/tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://tfhub.dev/google/random-nnlm-en-dim128/1\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "metadata": { - "id": "w0zsksbydmNp" - }, - "cell_type": "markdown", - "source": [ - "In this tutorial, you will learn how to use [Fairness Indicators](https://github.com/tensorflow/fairness-indicators) to evaluate embeddings from [TF Hub](https://www.tensorflow.org/hub). This notebook uses the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification)." - ] - }, - { - "metadata": { - "id": "u33JXdluZ2lG" - }, - "cell_type": "markdown", - "source": [ - "## Setup\n", - "\n", - "Install the required libraries." - ] - }, - { - "metadata": { - "id": "BAUEkqYlzP3W" - }, - "cell_type": "code", - "source": [ - "!pip install -q -U pip==20.2\n", - "\n", - "!pip install fairness-indicators \\\n", - " \"absl-py==0.12.0\" \\\n", - " \"pyarrow==10.0.1\" \\\n", - " \"apache-beam==2.50.0\" \\\n", - " \"avro-python3==1.9.1\"" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "e6pe8c6L7kCW" - }, - "cell_type": "markdown", - "source": [ - "Import other required libraries." 
- ] - }, - { - "metadata": { - "id": "B8dlyTyiTe-9" - }, - "cell_type": "code", - "source": [ - "import os\n", - "import tempfile\n", - "import apache_beam as beam\n", - "from datetime import datetime\n", - "import tensorflow as tf\n", - - "import tensorflow_hub as hub\n", - "import tensorflow_model_analysis as tfma\n", - "from tensorflow_model_analysis.addons.fairness.view import widget_view\n", - "from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators\n", - "from fairness_indicators import example_model\n", - "from fairness_indicators.tutorial_utils import util" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "Xz4PcI0hSVcq" - }, - "cell_type": "markdown", - "source": [ - "### Dataset\n", - "\n", - "In this notebook, you work with the [Civil Comments dataset](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification) which contains approximately 2 million public comments made public by the [Civil Comments platform](https://github.com/reaktivstudios/civil-comments) in 2017 for ongoing research. This effort was sponsored by Jigsaw, who have hosted competitions on Kaggle to help classify toxic comments as well as minimize unintended model bias.\n", - "\n", - "Each individual text comment in the dataset has a toxicity label, with the label being 1 if the comment is toxic and 0 if the comment is non-toxic. Within the data, a subset of comments are labeled with a variety of identity attributes, including categories for gender, sexual orientation, religion, and race or ethnicity." - ] - }, - { - "metadata": { - "id": "9ekzb7vVnPCc" - }, - "cell_type": "markdown", - "source": [ - "### Prepare the data\n", - "\n", - "TensorFlow parses features from data using [`tf.io.FixedLenFeature`](https://www.tensorflow.org/api_docs/python/tf/io/FixedLenFeature) and [`tf.io.VarLenFeature`](https://www.tensorflow.org/api_docs/python/tf/io/VarLenFeature). 
Map out the input feature, output feature, and all other slicing features of interest." - ] - }, - { - "metadata": { - "id": "n4_nXQDykX6W" - }, - "cell_type": "code", - "source": [ - "BASE_DIR = tempfile.gettempdir()\n", - "\n", - "# The input and output features of the classifier\n", - "TEXT_FEATURE = 'comment_text'\n", - "LABEL = 'toxicity'\n", - "\n", - "FEATURE_MAP = {\n", - " # input and output features\n", - " LABEL: tf.io.FixedLenFeature([], tf.float32),\n", - " TEXT_FEATURE: tf.io.FixedLenFeature([], tf.string),\n", - "\n", - " # slicing features\n", - " 'sexual_orientation': tf.io.VarLenFeature(tf.string),\n", - " 'gender': tf.io.VarLenFeature(tf.string),\n", - " 'religion': tf.io.VarLenFeature(tf.string),\n", - " 'race': tf.io.VarLenFeature(tf.string),\n", - " 'disability': tf.io.VarLenFeature(tf.string)\n", - "}\n", - "\n", - "IDENTITY_TERMS = ['gender', 'sexual_orientation', 'race', 'religion', 'disability']" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "CeUtnaT49Doq" - }, - "cell_type": "markdown", - "source": [ - "By default, the notebook downloads a preprocessed version of this dataset, but\n", - "you may use the original dataset and re-run the processing steps if\n", - "desired.\n", - "\n", - "In the original dataset, each comment is labeled with the percentage\n", - "of raters who believed that a comment corresponds to a particular\n", - "identity. For example, a comment might be labeled with the following:\n", - "`{ male: 0.3, female: 1.0, transgender: 0.0, heterosexual: 0.8,\n", - "homosexual_gay_or_lesbian: 1.0 }`.\n", - "\n", - "The processing step groups identity by category (gender,\n", - "sexual_orientation, etc.) and removes identities with a score less\n", - "than 0.5. So the example above would be converted to the following:\n", - "of raters who believed that a comment corresponds to a particular\n", - "identity. 
For example, the comment above would be labeled with the\n", - "following:\n", - "`{ gender: [female], sexual_orientation: [heterosexual,\n", - "homosexual_gay_or_lesbian] }`" - ] - }, - { - "metadata": { - "id": "FHxa31VX9eP2" - }, - "cell_type": "markdown", - "source": [ - "Download the dataset." - ] - }, - { - "metadata": { - "id": "NUmSmqYGS0n8" - }, - "cell_type": "code", - "source": [ - "download_original_data = False #@param {type:\"boolean\"}\n", - "\n", - "if download_original_data:\n", - " train_tf_file = tf.keras.utils.get_file('train_tf.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/train_tf.tfrecord')\n", - " validate_tf_file = tf.keras.utils.get_file('validate_tf.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf.tfrecord')\n", - "\n", - " # The identity terms list will be grouped together by their categories\n", - " # (see 'IDENTITY_COLUMNS') on threshold 0.5. Only the identity term column,\n", - " # text column and label column will be kept after processing.\n", - " train_tf_file = util.convert_comments_data(train_tf_file)\n", - " validate_tf_file = util.convert_comments_data(validate_tf_file)\n", - "\n", - "else:\n", - " train_tf_file = tf.keras.utils.get_file('train_tf_processed.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/train_tf_processed.tfrecord')\n", - " validate_tf_file = tf.keras.utils.get_file('validate_tf_processed.tfrecord',\n", - " 'https://storage.googleapis.com/civil_comments_dataset/validate_tf_processed.tfrecord')" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "zz1NLR5Uu3oQ" - }, - "cell_type": "markdown", - "source": [ - "## Create a TensorFlow Model Analysis Pipeline\n", - "\n", - "The Fairness Indicators library operates on [TensorFlow Model Analysis (TFMA) models](https://www.tensorflow.org/tfx/model_analysis/get_started). 
TFMA models wrap TensorFlow models with additional functionality to evaluate and visualize their results. The actual evaluation occurs inside of an [Apache Beam pipeline](https://beam.apache.org/documentation/programming-guide/).\n", - "\n", - "The steps you follow to create a TFMA pipeline are:\n", - "1. Build a TensorFlow model\n", - "2. Build a TFMA model on top of the TensorFlow model\n", - "3. Run the model analysis in an orchestrator. The example model in this notebook uses Apache Beam as the orchestrator." - ] - }, - { - "metadata": { - "id": "7nSvu4IUCigW" - }, - "cell_type": "code", - "source": [ - "def embedding_fairness_result(embedding, identity_term='gender'):\n", - " \n", - " model_dir = os.path.join(BASE_DIR, 'train',\n", - " datetime.now().strftime('%Y%m%d-%H%M%S'))\n", - "\n", - " print(\"Training classifier for \" + embedding)\n", - " classifier = example_model.train_model(model_dir,\n", - " train_tf_file,\n", - " LABEL,\n", - " TEXT_FEATURE,\n", - " FEATURE_MAP,\n", - " embedding)\n", - "\n", - " # Create a unique path to store the results for this embedding.\n", - " embedding_name = embedding.split('/')[-2]\n", - " eval_result_path = os.path.join(BASE_DIR, 'eval_result', embedding_name)\n", - "\n", - " example_model.evaluate_model(classifier,\n", - " validate_tf_file,\n", - " eval_result_path,\n", - " identity_term,\n", - " LABEL,\n", - " FEATURE_MAP)\n", - " return tfma.load_eval_result(output_path=eval_result_path)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "jTPqije9Eg5b" - }, - "cell_type": "markdown", - "source": [ - "## Run TFMA \u0026 Fairness Indicators" - ] - }, - { - "metadata": { - "id": "8AvInTNt8Gyn" - }, - "cell_type": "markdown", - "source": [ - "### Fairness Indicators Metrics\n", - "\n", - "Some of the metrics available with Fairness Indicators are:\n", - "\n", - "* [Negative Rate, False Negative Rate (FNR), and True Negative Rate 
(TNR)](https://en.wikipedia.org/wiki/False_positives_and_false_negatives#False_positive_and_false_negative_rates)\n", - "* [Positive Rate, False Positive Rate (FPR), and True Positive Rate (TPR)](https://en.wikipedia.org/wiki/False_positives_and_false_negatives#False_positive_and_false_negative_rates)\n", - "* [Accuracy](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Accuracy)\n", - "* [Precision and Recall](https://en.wikipedia.org/wiki/Precision_and_recall)\n", - "* [Precision-Recall AUC](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/AUC)\n", - "* [ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)" - ] - }, - { - "metadata": { - "id": "LGXCFtScblYt" - }, - "cell_type": "markdown", - "source": [ - "### Text Embeddings" - ] - }, - { - "metadata": { - "id": "1CI-1M5qXGjG" - }, - "cell_type": "markdown", - "source": [ - "**[TF-Hub](https://www.tensorflow.org/hub)** provides several **text embeddings**. These embeddings will serve as the feature column for the different models. This tutorial uses the following embeddings:\n", - "\n", - "* [**random-nnlm-en-dim128**](https://tfhub.dev/google/random-nnlm-en-dim128/1): random text embeddings, this serves as a convenient baseline.\n", - "* [**nnlm-en-dim128**](https://tfhub.dev/google/nnlm-en-dim128/1): a text embedding based on [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf). \n", - "* [**universal-sentence-encoder**](https://tfhub.dev/google/universal-sentence-encoder/2): a text embedding based on [Universal Sentence Encoder](https://arxiv.org/pdf/1803.11175.pdf)." 
- ] - }, - { - "metadata": { - "id": "xxq97Qt7itVL" - }, - "cell_type": "markdown", - "source": [ - "## Fairness Indicator Results" - ] - }, - { - "metadata": { - "id": "27FX15awixuK" - }, - "cell_type": "markdown", - "source": [ - "Compute fairness indicators with the `embedding_fairness_result` pipeline, and then render the results in the Fairness Indicator UI widget with `widget_view.render_fairness_indicator` for all the above embeddings.\n", - "\n", - "Note: You may need to run the `widget_view.render_fairness_indicator` cells twice for the visualization to be displayed." - ] - }, - { - "metadata": { - "id": "yEUbZ93y8NCW" - }, - "cell_type": "markdown", - "source": [ - "#### Random NNLM" - ] - }, - { - "metadata": { - "id": "DkSuox-Pb6Pz" - }, - "cell_type": "code", - "source": [ - "eval_result_random_nnlm = embedding_fairness_result('https://tfhub.dev/google/random-nnlm-en-dim128/1')" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "05xUesz6VpAe" - }, - "cell_type": "code", - "source": [ - "widget_view.render_fairness_indicator(eval_result=eval_result_random_nnlm)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "jmKe8Z1b8SBy" - }, - "cell_type": "markdown", - "source": [ - "#### NNLM" - ] - }, - { - "metadata": { - "id": "5b8HcTUBckj1" - }, - "cell_type": "code", - "source": [ - "eval_result_nnlm = embedding_fairness_result('https://tfhub.dev/google/nnlm-en-dim128/1')" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "n6hasLzFVrDN" - }, - "cell_type": "code", - "source": [ - "widget_view.render_fairness_indicator(eval_result=eval_result_nnlm)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "1I4xEDNq8T0X" - }, - "cell_type": "markdown", - "source": [ - "#### Universal Sentence Encoder" - ] - }, - { - "metadata": { - "id": "GrdweWRkck8A" - }, - "cell_type": "code", - "source": [ - "eval_result_use = 
embedding_fairness_result('https://tfhub.dev/google/universal-sentence-encoder/2')" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "JBABAkZMVtTK" - }, - "cell_type": "code", - "source": [ - "widget_view.render_fairness_indicator(eval_result=eval_result_use)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "402oTKbap77R" - }, - "cell_type": "markdown", - "source": [ - "### Comparing Embeddings" - ] - }, - { - "metadata": { - "id": "UgnqwNjpqBuv" - }, - "cell_type": "markdown", - "source": [ - "You can also use Fairness Indicators to compare embeddings directly. For example, compare the models generated from the NNLM and USE embeddings." - ] - }, - { - "metadata": { - "id": "49ECfYWUp7Kk" - }, - "cell_type": "code", - "source": [ - "widget_view.render_fairness_indicator(multi_eval_results={'nnlm': eval_result_nnlm, 'use': eval_result_use})" - ], - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Fairness Indicators on TF-Hub Text Embeddings", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..8c9fe594 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,89 @@ +site_name: Fairness Indicators +repo_name: "fairness-indicators" +repo_url: https://github.com/tensorflow/fairness-indicators + +theme: + name: material + logo: images/tf_full_color_primary_icon.svg + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + primary: custom + accent: custom + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + primary: custom + accent: custom + scheme: default + toggle: + icon: material/brightness-7 + name: Switch to 
dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + primary: custom + accent: custom + scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to system preference + favicon: images/tf_full_color_primary_icon.svg + + features: + - content.code.copy + - content.code.select + - content.action.edit + +extra_css: + - stylesheets/extra.css + +extra_javascript: + - javascripts/mathjax.js + - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js + +plugins: + - mkdocs-jupyter: + execute: false + +markdown_extensions: + - admonition + - attr_list + - def_list + - tables + - toc: + permalink: true + - pymdownx.highlight: + anchor_linenums: true + linenums: false + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - pymdownx.arithmatex: + generic: true + - pymdownx.critic + - pymdownx.caret + - pymdownx.keys + - pymdownx.mark + - pymdownx.tilde + - pymdownx.blocks.html + - md_in_html + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + +nav: + - "Overview": index.md + - "Thinking about Fairness Evaluation": guide/guidance.md + - "Introduction to Fairness Indicators": tutorials/Fairness_Indicators_Example_Colab.ipynb + - "Evaluate fairness using TF-Hub models": tutorials/Fairness_Indicators_on_TF_Hub_Text_Embeddings.ipynb + - "Visualize with Tensor Board Plugin": tutorials/Fairness_Indicators_TensorBoard_Plugin_Example_Colab.ipynb + - "Evaluate toxicity in Wiki comments": tutorials/Fairness_Indicators_TFCO_Wiki_Case_Study.ipynb + - "Tensor Flow constrained optimization example": tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study.ipynb + - "Pandas Data Frame case study": tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb + - "Face SSD example Colab": tutorials/Facessd_Fairness_Indicators_Example_Colab.ipynb diff --git a/requirements-docs.txt 
b/requirements-docs.txt new file mode 100644 index 00000000..540e5699 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,3 @@ +mkdocs +mkdocs-material +mkdocs-jupyter diff --git a/setup.py b/setup.py index 56c57079..2c4fc403 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ """Setup to install Fairness Indicators.""" import os +from pathlib import Path import sys import setuptools @@ -45,8 +46,12 @@ def select_constraint(default, nightly=None, git_master=None): ] TEST_PACKAGES = [ - 'pytest>=8.3.0,<9', + "pytest>=8.3.0,<9", ] + +with open(Path("./requirements-docs.txt").expanduser().absolute()) as f: + DOCS_PACKAGES = [req.strip() for req in f.readlines()] + # Get version from version module. with open('fairness_indicators/version.py') as fp: globals_dict = {} @@ -71,7 +76,8 @@ def select_constraint(default, nightly=None, git_master=None): install_requires=REQUIRED_PACKAGES, tests_require=REQUIRED_PACKAGES, extras_require={ - 'test': TEST_PACKAGES, + "docs": DOCS_PACKAGES, + "test": TEST_PACKAGES, }, # PyPI package information. classifiers=[