diff --git a/.github/ISSUE_TEMPLATE/config.yaml b/.github/ISSUE_TEMPLATE/config.yaml new file mode 100644 index 000000000..ec4bb386b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yaml @@ -0,0 +1 @@ +blank_issues_enabled: false \ No newline at end of file diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml new file mode 100644 index 000000000..a7f0da96e --- /dev/null +++ b/.github/workflows/publish-docs.yml @@ -0,0 +1,43 @@ +name: 'Build doc page' +on: + push: + branches: [ main, doc-page ] + tags: + - "*" + +jobs: + publish_docs: + name: Build the docs using Sphinx and push to gh-pages + runs-on: ubuntu-latest + env: + python-version: 3.8 + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: setup python ${{ env.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.python-version }} + - name: Install Flair dependencies + run: pip install -e . + - name: Install unittest dependencies + run: pip install -r requirements-dev.txt + - name: Install doc dependencies + run: pip install -r docs/requirements.txt + - name: Fetch git tags + run: git fetch --tags origin + - name: Build docs + run: | + sphinx-multiversion docs doc_build/ + - name: Add redirect to stable doc + run: | + cp assets/redirect.html doc_build/index.html + cp assets/redirect.html doc_build/404.html + cp assets/README.md doc_build/README.md + sed -i "s/\[VERSION\]/$(python -c 'import flair;print(flair.__version__)')/g" doc_build/index.html + sed -i "s/\[VERSION\]/$(python -c 'import flair;print(flair.__version__)')/g" doc_build/404.html + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./doc_build \ No newline at end of file diff --git a/assets/README.md b/assets/README.md new file mode 100644 index 000000000..55b6e8da7 --- /dev/null +++ b/assets/README.md @@ -0,0 +1,6 @@ +# Docs For Flair NLP + +This branch is currently under construction. + +It will contain the docs for Flair NLP. +Do not change any files here manually, as this branch is autogenerated using GitHub Actions.
\ No newline at end of file diff --git a/assets/redirect.html b/assets/redirect.html new file mode 100644 index 000000000..e3b5ad9ba --- /dev/null +++ b/assets/redirect.html @@ -0,0 +1,9 @@ + + + + Redirecting to https://flairnlp.github.io/ + + + + + \ No newline at end of file diff --git a/docs/_static/api.svg b/docs/_static/api.svg new file mode 100644 index 000000000..21e58f2f7 --- /dev/null +++ b/docs/_static/api.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/_static/contributing.svg b/docs/_static/contributing.svg new file mode 100644 index 000000000..c9f5e2780 --- /dev/null +++ b/docs/_static/contributing.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/_static/favicon.ico b/docs/_static/favicon.ico new file mode 100644 index 000000000..e497a6306 Binary files /dev/null and b/docs/_static/favicon.ico differ diff --git a/docs/_static/tutorial.svg b/docs/_static/tutorial.svg new file mode 100644 index 000000000..97559bd8a --- /dev/null +++ b/docs/_static/tutorial.svg @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_templates/page.html b/docs/_templates/page.html new file mode 100644 index 000000000..d7b48c82d --- /dev/null +++ b/docs/_templates/page.html @@ -0,0 +1,19 @@ +{% extends "!page.html" %} +{% block body %} +{% if current_version and latest_version and current_version != latest_version and current_version != release and current_version.name != latest_version.release %} +

+ + {% if current_version.is_released %} + {% if latest_version.release.replace('v', '').split('.') | map('int') | list > current_version.name.replace('v', '').split('.') | map('int') | list %} + You're reading an old version of this documentation. + If you want up-to-date information, please have a look at {{latest_version.name}}. + {% endif %} + {% else %} + You're reading the documentation for a development version. + For the latest stable version, please have a look at {{latest_version.name}}. + {% endif %} + +

+{% endif %} +{{ super() }} +{% endblock %}% \ No newline at end of file diff --git a/docs/_templates/version-switcher.html b/docs/_templates/version-switcher.html new file mode 100644 index 000000000..1d21c6c65 --- /dev/null +++ b/docs/_templates/version-switcher.html @@ -0,0 +1,30 @@ +{# As the version switcher will only work when JavaScript is enabled, we add it through JavaScript. + #} + diff --git a/docs/_templates/versioning.html b/docs/_templates/versioning.html new file mode 100644 index 000000000..0c8784af0 --- /dev/null +++ b/docs/_templates/versioning.html @@ -0,0 +1,17 @@ +{% if versions %} +

{{ _('Versions') }}

+ +{% endif %} \ No newline at end of file diff --git a/docs/api/datasets/base.rst b/docs/api/datasets/base.rst new file mode 100644 index 000000000..e42784deb --- /dev/null +++ b/docs/api/datasets/base.rst @@ -0,0 +1,4 @@ +flair.datasets.base +=================== + +.. automodule:: flair.datasets.base \ No newline at end of file diff --git a/docs/api/datasets/biomedical.rst b/docs/api/datasets/biomedical.rst new file mode 100644 index 000000000..d59bd8c58 --- /dev/null +++ b/docs/api/datasets/biomedical.rst @@ -0,0 +1,4 @@ +flair.datasets.biomedical +========================= + +.. automodule:: flair.datasets.biomedical \ No newline at end of file diff --git a/docs/api/datasets/document_classification.rst b/docs/api/datasets/document_classification.rst new file mode 100644 index 000000000..d8303f3ae --- /dev/null +++ b/docs/api/datasets/document_classification.rst @@ -0,0 +1,4 @@ +flair.datasets.document_classification +====================================== + +.. automodule:: flair.datasets.document_classification \ No newline at end of file diff --git a/docs/api/datasets/entity_linking.rst b/docs/api/datasets/entity_linking.rst new file mode 100644 index 000000000..cdb2b3235 --- /dev/null +++ b/docs/api/datasets/entity_linking.rst @@ -0,0 +1,4 @@ +flair.datasets.entity_linking +============================= + +.. automodule:: flair.datasets.entity_linking \ No newline at end of file diff --git a/docs/api/datasets/ocr.rst b/docs/api/datasets/ocr.rst new file mode 100644 index 000000000..3f8534044 --- /dev/null +++ b/docs/api/datasets/ocr.rst @@ -0,0 +1,4 @@ +flair.datasets.ocr +================== + +.. automodule:: flair.datasets.ocr \ No newline at end of file diff --git a/docs/api/datasets/relation_extraction.rst b/docs/api/datasets/relation_extraction.rst new file mode 100644 index 000000000..62dcdd55d --- /dev/null +++ b/docs/api/datasets/relation_extraction.rst @@ -0,0 +1,4 @@ +flair.datasets.relation_extraction +================================== + +.. automodule:: flair.datasets.relation_extraction \ No newline at end of file diff --git a/docs/api/datasets/sequence_labeling.rst b/docs/api/datasets/sequence_labeling.rst new file mode 100644 index 000000000..875d4831b --- /dev/null +++ b/docs/api/datasets/sequence_labeling.rst @@ -0,0 +1,4 @@ +flair.datasets.sequence_labeling +================================ + +.. automodule:: flair.datasets.sequence_labeling \ No newline at end of file diff --git a/docs/api/datasets/text_image.rst b/docs/api/datasets/text_image.rst new file mode 100644 index 000000000..f14e56491 --- /dev/null +++ b/docs/api/datasets/text_image.rst @@ -0,0 +1,4 @@ +flair.datasets.text_image +========================= + +.. automodule:: flair.datasets.text_image \ No newline at end of file diff --git a/docs/api/datasets/text_text.rst b/docs/api/datasets/text_text.rst new file mode 100644 index 000000000..f88dfd1ae --- /dev/null +++ b/docs/api/datasets/text_text.rst @@ -0,0 +1,4 @@ +flair.datasets.text_text +========================= + +.. automodule:: flair.datasets.text_text \ No newline at end of file diff --git a/docs/api/datasets/treebanks.rst b/docs/api/datasets/treebanks.rst new file mode 100644 index 000000000..0d6c14a28 --- /dev/null +++ b/docs/api/datasets/treebanks.rst @@ -0,0 +1,4 @@ +flair.datasets.treebanks +======================== + +.. 
automodule:: flair.datasets.treebanks \ No newline at end of file diff --git a/docs/api/embeddings/base.rst b/docs/api/embeddings/base.rst new file mode 100644 index 000000000..1bf51ffa7 --- /dev/null +++ b/docs/api/embeddings/base.rst @@ -0,0 +1,4 @@ +flair.embeddings.base +===================== + +.. automodule:: flair.embeddings.base \ No newline at end of file diff --git a/docs/api/embeddings/document.rst b/docs/api/embeddings/document.rst new file mode 100644 index 000000000..ca870fc8e --- /dev/null +++ b/docs/api/embeddings/document.rst @@ -0,0 +1,4 @@ +flair.embeddings.document +========================= + +.. automodule:: flair.embeddings.document \ No newline at end of file diff --git a/docs/api/embeddings/image.rst b/docs/api/embeddings/image.rst new file mode 100644 index 000000000..2a701b9e0 --- /dev/null +++ b/docs/api/embeddings/image.rst @@ -0,0 +1,4 @@ +flair.embeddings.image +====================== + +.. automodule:: flair.embeddings.image \ No newline at end of file diff --git a/docs/api/embeddings/legacy.rst b/docs/api/embeddings/legacy.rst new file mode 100644 index 000000000..974a777eb --- /dev/null +++ b/docs/api/embeddings/legacy.rst @@ -0,0 +1,8 @@ +flair.embeddings.legacy +======================= + +.. warning:: + All embeddings in `flair.embeddings.legacy` are considered deprecated. + There is no guarantee that they still work, and we recommend using different embeddings instead. + +.. automodule:: flair.embeddings.legacy \ No newline at end of file diff --git a/docs/api/embeddings/token.rst b/docs/api/embeddings/token.rst new file mode 100644 index 000000000..3705fedb1 --- /dev/null +++ b/docs/api/embeddings/token.rst @@ -0,0 +1,4 @@ +flair.embeddings.token +====================== + +.. automodule:: flair.embeddings.token \ No newline at end of file diff --git a/docs/api/embeddings/transformer.rst b/docs/api/embeddings/transformer.rst new file mode 100644 index 000000000..2bda02f77 --- /dev/null +++ b/docs/api/embeddings/transformer.rst @@ -0,0 +1,4 @@ +flair.embeddings.transformer +============================ + +.. automodule:: flair.embeddings.transformer \ No newline at end of file diff --git a/docs/api/flair.data.rst b/docs/api/flair.data.rst new file mode 100644 index 000000000..00dd67a52 --- /dev/null +++ b/docs/api/flair.data.rst @@ -0,0 +1,4 @@ +flair.data +========== + +.. automodule:: flair.data \ No newline at end of file diff --git a/docs/api/flair.datasets.rst b/docs/api/flair.datasets.rst new file mode 100644 index 000000000..9a883c3e6 --- /dev/null +++ b/docs/api/flair.datasets.rst @@ -0,0 +1,8 @@ +flair.datasets +============== + +.. toctree:: + :glob: + :maxdepth: 2 + + datasets/* diff --git a/docs/api/flair.embeddings.rst b/docs/api/flair.embeddings.rst new file mode 100644 index 000000000..3f70e62be --- /dev/null +++ b/docs/api/flair.embeddings.rst @@ -0,0 +1,8 @@ +flair.embeddings +================ + +.. toctree:: + :glob: + :maxdepth: 2 + + embeddings/* \ No newline at end of file diff --git a/docs/api/flair.models.rst b/docs/api/flair.models.rst new file mode 100644 index 000000000..8679b3fb7 --- /dev/null +++ b/docs/api/flair.models.rst @@ -0,0 +1,4 @@ +flair.models +============ + +.. automodule:: flair.models \ No newline at end of file diff --git a/docs/api/flair.nn.rst b/docs/api/flair.nn.rst new file mode 100644 index 000000000..4eb066d3e --- /dev/null +++ b/docs/api/flair.nn.rst @@ -0,0 +1,4 @@ +flair.nn +======== + +.. automodule:: flair.nn
\ No newline at end of file diff --git a/docs/api/flair.rst b/docs/api/flair.rst new file mode 100644 index 000000000..4e12a0382 --- /dev/null +++ b/docs/api/flair.rst @@ -0,0 +1,4 @@ +flair +===== + +.. automodule:: flair \ No newline at end of file diff --git a/docs/api/flair.splitter.rst b/docs/api/flair.splitter.rst new file mode 100644 index 000000000..5863df578 --- /dev/null +++ b/docs/api/flair.splitter.rst @@ -0,0 +1,4 @@ +flair.splitter +============== + +.. automodule:: flair.splitter \ No newline at end of file diff --git a/docs/api/flair.tokenization.rst b/docs/api/flair.tokenization.rst new file mode 100644 index 000000000..00f2bc4bf --- /dev/null +++ b/docs/api/flair.tokenization.rst @@ -0,0 +1,4 @@ +flair.tokenization +================== + +.. automodule:: flair.tokenization \ No newline at end of file diff --git a/docs/api/flair.trainers.plugins.rst b/docs/api/flair.trainers.plugins.rst new file mode 100644 index 000000000..4bb766876 --- /dev/null +++ b/docs/api/flair.trainers.plugins.rst @@ -0,0 +1,4 @@ +flair.trainers.plugins +====================== + +.. automodule:: flair.trainers.plugins \ No newline at end of file diff --git a/docs/api/flair.trainers.rst b/docs/api/flair.trainers.rst new file mode 100644 index 000000000..db11b5029 --- /dev/null +++ b/docs/api/flair.trainers.rst @@ -0,0 +1,4 @@ +flair.trainers +============== + +.. automodule:: flair.trainers \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst new file mode 100644 index 000000000..0f67f3cf2 --- /dev/null +++ b/docs/api/index.rst @@ -0,0 +1,9 @@ +API Docs +======== + +.. toctree:: + :glob: + :maxdepth: 2 + + flair + flair.* \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..8f448b937 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,133 @@ +# noqa: INP001 + +import importlib_metadata + +# -- Project information ----------------------------------------------------- +from sphinx_github_style import get_linkcode_resolve + +version = "0.12.2" +release = "0.12.2" +project = "flair" +author = importlib_metadata.metadata(project)["Author"] +copyright = f"2023 {author}" + +# The full version, including alpha/beta/rc tags +top_level = project.replace("-", "_") + +linkcode_url = importlib_metadata.metadata(project)["Home-page"] +html_show_sourcelink = True + +smv_current_version = ""  # will be overwritten by sphinx-multiversion with the name of the tag or branch. +html_context = { + "display_github": True, + "github_user": "flairNLP", + "github_repo": "flair", + "github_version": "", + "conf_py_path": "/docs/", +}  # dummy values so that sphinx-github-style won't crash when run in a temp folder. + +html_theme_options = { + "navbar_end": ["theme-switcher", "version-switcher", "navbar-icon-links"], + "github_url": linkcode_url, + "icon_links": [ + { + "name": "PyPI", + "url": "https://pypi.org/project/flair", + "icon": "fas fa-box", + }, + ], +} + + +def linkcode_resolve(*args): + # use smv_current_version as the git ref in the source link url + real_linkcode_url = linkcode_url + f"/blob/{smv_current_version}/" + "{filepath}#L{linestart}-L{linestop}" + return get_linkcode_resolve(real_linkcode_url)(*args) + + +# -- General configuration --------------------------------------------------- +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones.
+extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.napoleon", # to render Google format docstrings + "sphinx.ext.githubpages", + "myst_parser", + "sphinx_github_style", + "sphinx_autodoc_typehints", + "sphinx_multiversion", + "sphinx_design", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pydata_sphinx_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Napoleon settings +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = True + +autodoc_default_options = { + "member-order": "bysource", + "undoc-members": True, + "members": True, + "show-inheritance": True, + "private-members": False, + "inherited": True, +} + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +html_sidebars = { + "**": [ + "globaltoc.html", + "searchbox.html", + "versioning.html", + ], + "index": [], +} + +smv_latest_version = importlib_metadata.version(project) + +# Whitelist pattern for tags (set to None to ignore all tags) +smv_tag_whitelist = r"^\d+\.\d+\.\d+$" + +# Whitelist pattern for branches (set to None to ignore all branches) +smv_branch_whitelist = r"^master$" + +# Whitelist pattern for remotes (set to None to use local branches only) +smv_remote_whitelist = r"^origin$" + +# Pattern for released versions +smv_released_pattern = r"^refs/tags/\d+\.\d+\.\d+$" + +# Format for versioned output directories inside the build directory +smv_outputdir_format = "{ref.name}" + +# Determines whether remote or local git branches/tags are preferred if their output dirs conflict +smv_prefer_remote_refs = False + +html_favicon = "_static/favicon.ico" diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst new file mode 100644 index 000000000..ef72362f3 --- /dev/null +++ b/docs/contributing/index.rst @@ -0,0 +1,10 @@ +Contributing +============ + +.. toctree:: + :maxdepth: 1 + + writing_a_good_issue + local_development + making_a_pull_request + updating_documentation diff --git a/docs/contributing/local_development.md b/docs/contributing/local_development.md new file mode 100644 index 000000000..87439439f --- /dev/null +++ b/docs/contributing/local_development.md @@ -0,0 +1,47 @@ +# Local Development + +For contributors looking to get deeper into the API we suggest cloning the repository and checking out the unit +tests for examples of how to call methods. Most classes and methods are documented, so finding your way around +the code should hopefully be easy. + +## Setup + +Flair requires python-3.8 or higher. To make sure our code also runs on the oldest supported +python version, it is recommended to use python-3.8.x for flair development. 
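+ +For example, you could create such an environment with the built-in `venv` module. This is just one option (any environment manager works); the sketch below assumes a `python3.8` interpreter is available on your PATH: + +```bash +# create and activate a fresh virtual environment based on Python 3.8 +python3.8 -m venv .venv +source .venv/bin/activate + +# double-check the interpreter version +python --version +```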
+ +Create a python environment of your preference (e.g. as sketched above) and run: +```bash +pip install -r requirements-dev.txt +pip install -e . +``` + +## Tests + +To run only the type checks and code formatting checks, execute: + +```bash +pytest flair +``` + +To run all basic tests, execute: + +```bash +pytest +``` + +To run the integration tests, execute: + +```bash +pytest --runintegration +``` + +The integration tests will train small models and therefore take more time. +In general, it is recommended to ensure that all basic tests pass before running the integration tests. + +## Code Formatting + +To ensure a standardized code style, we use the formatter [black](https://github.com/ambv/black), and for standardizing imports we use [ruff](https://github.com/charliermarsh/ruff). +If your code is not formatted properly, the tests will fail. + +We recommend configuring your IDE to run these formatters for you, but you can also always run them manually via +`black . && ruff --fix .` in the flair root folder. \ No newline at end of file diff --git a/docs/contributing/making_a_pull_request.md b/docs/contributing/making_a_pull_request.md new file mode 100644 index 000000000..ae795ec68 --- /dev/null +++ b/docs/contributing/making_a_pull_request.md @@ -0,0 +1,17 @@ +# Making a pull request + +We are happy to accept your contributions to make `flair` better and more awesome! To avoid unnecessary work on either +side, please stick to the following process: + +1. Check if there is already [an issue](https://github.com/flairNLP/flair/issues) for your concern. +2. If there is not, open a new one to start a discussion. We hate to close finished PRs! +3. If we decide your concern needs code changes, we would be happy to accept a pull request. Please consider the + commit guidelines below. + + +## Git Commit Guidelines + +If there is already a ticket, use its number at the start of your commit message. +Use meaningful commit messages that describe what you did. + +**Example:** `GH-42: Added new type of embeddings: DocumentEmbedding.` \ No newline at end of file diff --git a/docs/contributing/updating_documentation.md b/docs/contributing/updating_documentation.md new file mode 100644 index 000000000..f97679362 --- /dev/null +++ b/docs/contributing/updating_documentation.md @@ -0,0 +1,46 @@ +# Updating documentation + + +## What is good documentation? + +Good documentation +* Always refers to the end user. Do not document *why* something is the way it is, but rather *how* to use it. +* Doesn't lie and is always up-to-date. Whenever code is updated, consider whether the documentation needs to change accordingly to reflect reality. +* Provides useful links wherever possible. Do not reference another object without linking it. + + +## Tutorials + +All tutorials are markdown files stored in [the tutorial folder](https://github.com/flairNLP/flair/tree/master/docs/tutorial). +When adding a new tutorial, you must add its name to the `index.rst` file in the respective folder. +We are using the [MyST parser](https://myst-parser.readthedocs.io/en/latest/syntax/typography.html), which adds +some additional syntax over markdown. + +A tutorial should always be easy to understand and reference the API documentation for further reading.
+ +```{note} + You can reference symbols by defining links + e.g.: ``[`flair.set_seed`](#flair.set_seed)`` for a function + e.g.: `[entity-linking](project:../tutorial/tutorial-basics/entity-linking.md)` for another tutorial +``` + +## Docstrings + +For docstrings we follow the [Google docstring](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) format. +We do not need to specify types or default values, as those will be extracted from the function signature. + +Docstrings usually start with a one-liner giving a simple explanation of the object. A more detailed explanation follows **if required**. +Ensure that you always use cross-references instead of just mentioning another object, +e.g. ``:class:`flair.models.SequenceTagger` `` can be used to reference the SequenceTagger. + + +## Building the local docs + +To build the docs: + +* Ensure that you have everything committed. Local changes won't be used for building. +* Install the build dependencies via `pip install -r docs/requirements.txt`. +* In `docs/conf.py` temporarily add your local branch name to the `smv_branch_whitelist` pattern. + E.g. if your branch is called `doc-page`, `smv_branch_whitelist` needs to have the value `r"^master|doc-page$"` +* Run `sphinx-multiversion docs doc_build/` to generate the docs. +* Open `doc_build/<branch-name>/index.html` to view the docs. diff --git a/docs/contributing/writing_a_good_issue.md b/docs/contributing/writing_a_good_issue.md new file mode 100644 index 000000000..d3b5d4ff4 --- /dev/null +++ b/docs/contributing/writing_a_good_issue.md @@ -0,0 +1,59 @@ +# Writing a good issue + +You are likely reading this because you want to create an issue. +This is great: issues are a great way to provide feedback, which can then be used to enhance the library. +Here are some guidelines to make the issue as insightful as possible. + +## Issue types + +Before you start with the issue, you need to choose its type. +There are 3 issue types: + +* **Bug Report** -> You have noticed something that doesn't work the way it is expected to. +* **Feature/Enhancement request** -> You have an idea for something that would make flair better. +* **Question** -> You have a question that is unrelated to a potential bug or feature request. + +### Bug Report + +A bug report is one of the most common issues. It is simple: you tried something, but it didn't work as expected. +It is important to provide as much context as possible, so ensure that you ran [collect_env.py](https://github.com/flairNLP/flair/blob/master/collect_env.py) and, if required, created a minimal reproducible example. +The minimal reproducible example has, like the name says, two properties: +* it is reproducible +* it is as small as possible + +**Reproducibility** + +Please ensure that we can really reproduce your issue. + +You might have encountered the issue while training on your custom dataset and don't want to share it. That is ok, +but maybe you can test if you can recreate the same bug by using one of the many public datasets instead; if not, +maybe filter the problem down to a single sentence and report what property it has. + +It is also possible that you have encountered the issue while predicting some sentences. Maybe you don't want to share +your trained model, but maybe you can recreate the issue by creating a model without training it? + +Please be sure not to add local paths or load any data that others cannot access.
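+ +For instance, building on the idea of recreating an issue with an untrained model, a self-contained sketch might look like the following. This is only an illustration: the embeddings, labels and sentence here are placeholders that you would swap for whatever triggers your issue: + +```python +from flair.data import Dictionary, Sentence +from flair.embeddings import WordEmbeddings +from flair.models import SequenceTagger + +# a tiny hand-made label dictionary instead of a private corpus +tag_dictionary = Dictionary(add_unk=False) +for tag in ["O", "B-PER", "I-PER"]: +    tag_dictionary.add_item(tag) + +# an untrained model is often enough to reproduce prediction or save/load bugs +tagger = SequenceTagger( +    hidden_size=32, +    embeddings=WordEmbeddings("glove"), +    tag_dictionary=tag_dictionary, +    tag_type="ner", +) + +# a single public sentence that triggers the problem +tagger.predict(Sentence("George Washington went to Washington.")) +```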
+ +**Minimal** + +After ensuring reproducibility, please also take some time to make the example minimal. That way, we can understand more quickly +what the issue is and won't need to spend time debugging code that is unrelated to it. + +For example, you might get an error where the stack trace shows that it occurred while saving the model. In that case, +you can verify whether the model really needs to be trained on the full dataset for 100 epochs, or whether it would be enough +to just create a model and save it with no training involved. + +### Feature/Enhancement request + +For a Feature/Enhancement request, please provide not only the *what* but also the *why*: it is easier to judge how important a feature is +when you know why it is wanted and what it could provide to the users. + +### Question + +Questions are the most generic type of issue, but also the ones that usually lack the most context. +Please ensure that you are not creating a Question that should actually be a bug report. + +For example, issues like `[Question]: Something is wrong with ...`, `[Question]: sentence.to_dict(tag_type='ner') no longer have ...` +or `[Question]: MultiTagger cannot be loaded...` are examples of issues that clearly should be bug reports instead and +could have been resolved more quickly if enough context and a minimal reproducible example had been provided. + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..0e37ddc70 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,77 @@ +flair +===== + +.. _flair_docs_mainpage: + + +**Version**: |version| + +**Useful links**: +`Getting started `_ | +`Source Repository `_ | +`Issue Tracker `_ | + +Flair is a very simple framework for state-of-the-art Natural Language Processing (NLP). + +.. grid:: 2 + + .. grid-item-card:: + :img-top: ./_static/tutorial.svg + + Tutorial + ^^^^^^^^ + + New to Flair? Check out the tutorials. They contain an introduction to Flair's main concepts. + + +++ + + .. button-ref:: tutorial/index + :expand: + :color: secondary + :click-parent: + + To the tutorials + + .. grid-item-card:: + :img-top: ./_static/api.svg + + API-docs + ^^^^^^^^ + + The API docs provide in-depth information on the classes and functions designed for public use. + + +++ + + .. button-ref:: api/index + :expand: + :color: secondary + :click-parent: + + To the API docs + + .. grid-item-card:: + :img-top: ./_static/contributing.svg + + Contributor's Guide + ^^^^^^^^^^^^^^^^^^^ + + Want to add to the codebase? Maybe you can help add a translation or a flowchart to the + documentation? The contributing guidelines will guide you through the + process of improving Flair. + + +++ + + .. button-ref:: contributing/index + :expand: + :color: secondary + :click-parent: + + To the contributor's guide + +.. toctree:: + :maxdepth: 3 + :hidden: + + Tutorials + API reference + Contributing \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..b6a98f118 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx-github-style<=1.0.2 # 1.0.3 changes logic that breaks with sphinx-multiversion +sphinx-autodoc-typehints +myst-parser +sphinx +importlib-metadata +sphinx-multiversion +pydata-sphinx-theme +sphinx_design \ No newline at end of file diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst new file mode 100644 index 000000000..9636c33c5 --- /dev/null +++ b/docs/tutorial/index.rst @@ -0,0 +1,13 @@ +Tutorials +========= + + +.. _flair_tutorials: + +..
toctree:: + :maxdepth: 2 + + intro + tutorial-basics/index + tutorial-training/index + tutorial-embeddings/index \ No newline at end of file diff --git a/docs/tutorial/intro.md b/docs/tutorial/intro.md new file mode 100644 index 000000000..44eb7f1b3 --- /dev/null +++ b/docs/tutorial/intro.md @@ -0,0 +1,92 @@ +--- +sidebar_position: 1 +--- + +(getting_started)= + +# Quick Start + +Let's discover **Flair in less than 5 minutes**. + +## Requirements and Installation + +In your favorite virtual environment, simply do: + +``` +pip install flair +``` + +Flair requires Python 3.7+. + +## Example 1: Tag Entities in Text + +Let's run **named entity recognition** (NER) over the following example sentence: "_I love Berlin and New York._" + +Our goal is to identify names in this sentence, and their types. + +To do this, all you need is to make a [`Sentence`](#flair.data.Sentence) for this text, load a pre-trained model and use it to predict tags for the sentence: + + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('I love Berlin and New York.') + +# load the NER tagger +tagger = Classifier.load('ner') + +# run NER over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) +``` + +This should print: + +```console +Sentence[7]: "I love Berlin and New York." → ["Berlin"/LOC, "New York"/LOC] +``` + +The output shows that both "Berlin" and "New York" were tagged as **location entities** (LOC) in this sentence. + + +## Example 2: Detect Sentiment + +Let's run **sentiment analysis** over the same sentence to determine whether it is POSITIVE or NEGATIVE. + +You can do this with essentially the same code as above. Just instead of loading the 'ner' model, you now load the 'sentiment' model: + + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('I love Berlin and New York.') + +# load the sentiment tagger +tagger = Classifier.load('sentiment') + +# run sentiment analysis over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) + +``` + +This should print: + +```console +Sentence[7]: "I love Berlin and New York." → POSITIVE (0.9982) +``` + +The output shows that the sentence "_I love Berlin and New York._" was tagged as having **POSITIVE** sentiment. + + +## Summary + +Congrats, you now know how to use Flair to find entities and detect sentiment! \ No newline at end of file diff --git a/docs/tutorial/tutorial-basics/basic-types.md b/docs/tutorial/tutorial-basics/basic-types.md new file mode 100644 index 000000000..703a5d7cd --- /dev/null +++ b/docs/tutorial/tutorial-basics/basic-types.md @@ -0,0 +1,269 @@ +# Basics + +This tutorial explains the basic concepts used in Flair: + +- what is a [`Sentence`](#flair.data.Sentence) +- what is a [`Label`](#flair.data.Label) + +You should be familiar with these two concepts in order to get the most out of Flair. + +## What is a Sentence + +If you want to tag a sentence, you need to first make a [`Sentence`](#flair.data.Sentence) object for it. + +For example, say you want to tag the text "_The grass is green._". + +Let's start by making a [`Sentence`](#flair.data.Sentence) object for this sentence. 
+ + +```python +# The sentence object holds a sentence that we may want to embed or tag +from flair.data import Sentence + +# Make a sentence object by passing a string +sentence = Sentence('The grass is green.') + +# Print the object to see what's in there +print(sentence) +``` + +This should print: + +```console +Sentence[5]: "The grass is green." +``` + +The print-out tells us that the sentence consists of 5 tokens. + +```{note} +A token is an atomic unit of the text, often a word or punctuation. The printout is therefore telling us that the sentence "_The grass is green._" consists of 5 such atomic units. +``` + +### Iterating over the tokens in a Sentence + +So what are the 5 tokens in this example sentence? + +You can iterate over all tokens in a sentence like this: + + +```python +for token in sentence: + print(token) +``` + +This should print: + +```console +Token[0]: "The" +Token[1]: "grass" +Token[2]: "is" +Token[3]: "green" +Token[4]: "." +``` + +This printout is telling us that the 5 tokens in the text are the words "_The_", "_grass_", "_is_", "_green_", with a separate token for the full stop at the end. The tokens therefore correspond to the words and the punctuation of the text. + +### Directly accessing a token + +You can access the tokens of a sentence via their token id or with their index: + +```python +# using the token id +print(sentence.get_token(4)) +# using the index itself +print(sentence[3]) +``` + +which should print in both cases + +```console +Token[3]: "green" +``` + +This print-out includes the token index (3) and the lexical value of the token ("green"). + +### Tokenization + +When you create a [`Sentence`](#flair.data.Sentence) as above, the text is automatically tokenized (segmented into words) using the [segtok](https://pypi.org/project/segtok/) library. + +```{note} +You can also use a different tokenizer if you like. To learn more about this, check out our tokenization tutorial. +``` + + +## What is a Label + +All Flair models predict labels. For instance, our sentiment analysis models will predict labels for a sentence. Our NER models will predict labels for tokens in a sentence. + +### Example 1: Labeling a token in a sentence + +To illustrate how labels work, let's use the same example sentence as above: "_The grass is green._". + +Let us label all "color words" in this sentence. Since the sentence contains only one color word (namely "green"), we only need to add a label to one of the tokens. + +We access token 3 in the sentence, and set a label for it: + +```python +# Make a sentence object by passing a string +sentence = Sentence('The grass is green.') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# print the sentence (now with this annotation) +print(sentence) +``` + +This should print: + +```console +Sentence: "The grass is green ." → ["green"/color] +``` + +The output indicates that the word "green" in this sentence is labeled as a "color". You can also +iterate through each token and print it to see if it has labels: + +```python +for token in sentence: + print(token) +``` + +This should print: + +```console +Token[0]: "The" +Token[1]: "grass" +Token[2]: "is" +Token[3]: "green" → color (1.0) +Token[4]: "." +``` + +This shows that there are 5 tokens in the sentence, one of which has a label. + +```{note} +The [`add_label`](#flair.data.DataPoint.add_label) method used here has two mandatory parameters: the label type (here `'ner'`) and the label value (here `'color'`).
+``` + +### Example 2: Labeling a whole sentence + +Sometimes you want to label an entire sentence instead of only a token. Do this by calling [`add_label`](#flair.data.DataPoint.add_label) on the whole sentence. + +For example, say we want to add a sentiment label to the sentence "_The grass is green._": + +```python +sentence = Sentence('The grass is green.') + +# add a label to a sentence +sentence.add_label('sentiment', 'POSITIVE') + +print(sentence) +``` + +This should print: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) +``` + +Indicating that this sentence is now labeled as having a positive sentiment. + +### Multiple labels + +Importantly, in Flair you can add as many labels to a sentence as you like. + +Let's bring the two examples above together: We will label the sentence "_The grass is green._" with an overall positive sentiment, and also add a "color" tag to the token "green": + +```python +sentence = Sentence('The grass is green.') + +# add a sentiment label to the sentence +sentence.add_label('sentiment', 'POSITIVE') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# print the sentence with all annotations +print(sentence) +``` + +This will print: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) → ["green"/color] +``` + +Indicating that the sentence is now labeled with two different types of information. + +### Accessing labels + +You can iterate through all labels of a sentence using the [`get_labels()`](#flair.data.Sentence.get_labels) method: + +```python +# iterate over all labels and print +for label in sentence.get_labels(): + print(label) +``` + +This will get each label and print it. For instance, let's re-use the previous example in which we add two different labels to the same sentence: + +```python +sentence = Sentence('The grass is green.') + +# add a sentiment label to the sentence +sentence.add_label('sentiment', 'POSITIVE') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# iterate over all labels and print +for label in sentence.get_labels(): + print(label) +``` + +This will now print the following two lines: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) +Token[3]: "green" → color (1.0) +``` + +This printout tells us that there are two labels: The first is for the whole sentence, tagged as POSITIVE. The second is only for the token "green", tagged as "color". + +````{note} + +If you only want to iterate over labels of a specific type, add the label name as a parameter to [`get_labels()`](#flair.data.Sentence.get_labels). For instance, to only iterate over all NER labels, do: + +```python +# iterate over all NER labels only +for label in sentence.get_labels('ner'): + print(label) +``` +```` + +### Information for each label + +Each label is of class [`Label`](#flair.data.Label), which has a score indicating confidence in addition to its value. It also holds a pointer back to the data point it attaches to.
+ +This means that you can print the value, the confidence and the labeled text of each label: + +```python +sentence = Sentence('The grass is green.') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# iterate over all labels and print +for label in sentence.get_labels(): + + # Print the text, the label value and the label score + print(f'"{label.data_point.text}" is classified as "{label.value}" with score {label.score}') +``` + +This should print: + +``` +"green" is classified as "color" with score 1.0 +``` + +Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. + diff --git a/docs/tutorial/tutorial-basics/entity-linking.md b/docs/tutorial/tutorial-basics/entity-linking.md new file mode 100644 index 000000000..8137c2dc5 --- /dev/null +++ b/docs/tutorial/tutorial-basics/entity-linking.md @@ -0,0 +1,85 @@ +# Tagging and linking entities + +As of Flair 0.12 we ship an **experimental entity linker** trained on the [Zelda dataset](https://github.com/flairNLP/zelda). The linker not only +tags entities, but also attempts to link each entity to the corresponding Wikipedia URL if one exists. + +## Example 1: Entity linking on a single sentence + +To illustrate, let's use the example sentence "_Kirk and Spock met on the Enterprise._": + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('linker') + +# make a sentence +sentence = Sentence('Kirk and Spock met on the Enterprise.') + +# predict entity links +tagger.predict(sentence) + +# iterate over predicted entities and print +for label in sentence.get_labels(): + print(label) +``` + +This should print: +```console +Span[0:1]: "Kirk" → James_T._Kirk (0.9969) +Span[2:3]: "Spock" → Spock (0.9971) +Span[6:7]: "Enterprise" → USS_Enterprise_(NCC-1701-D) (0.975) +``` + +As we can see, the linker resolves what the three entity mentions refer to: +- "Kirk" refers to the entity "[James_T._Kirk](https://en.wikipedia.org/wiki/James_T._Kirk)" +- "Spock" refers to "[Spock](https://en.wikipedia.org/wiki/Spock)" (ok, that one was easy) +- "Enterprise" refers to the "[USS_Enterprise_(NCC-1701-D)](https://en.wikipedia.org/wiki/USS_Enterprise_(NCC-1701-D))" + + Not bad, eh? However, that last prediction is not quite correct, as Star Trek fans will know. Entity linking is a hard task and we are working to improve the accuracy of our model. + + + +## Example 2: Entity linking on a text document (multiple sentences) + +Entity linking typically works best when applied to a whole document instead of only a single sentence. + +To illustrate how this works, let's use the following short text: "_Bayern played against Barcelona. The match took place in Barcelona._" + +In this case, split the text into sentences and pass a list of Sentence objects to the [`Classifier.predict()`](#flair.nn.Classifier.predict) method: + +```python +from flair.nn import Classifier +from flair.splitter import SegtokSentenceSplitter + +# example text with multiple sentences +text = "Bayern played against Barcelona. The match took place in Barcelona."
+ +# initialize sentence splitter +splitter = SegtokSentenceSplitter() + +# use splitter to split text into list of sentences +sentences = splitter.split(text) + +# predict tags for sentences +tagger = Classifier.load('linker') +tagger.predict(sentences) + +# iterate through sentences and print predicted labels +for sentence in sentences: + print(sentence) +``` + +This should print: +```console +Sentence[5]: "Bayern played against Barcelona." → ["Bayern"/FC_Bayern_Munich, "Barcelona"/FC_Barcelona] +Sentence[7]: "The match took place in Barcelona." → ["Barcelona"/Barcelona] +``` + +As we can see, the linker can resolve that: + +- "Bayern" refers to the soccer club "[FC Bayern Munich](https://en.wikipedia.org/wiki/FC_Bayern_Munich)" +- the first mention of "Barcelona" refers to the soccer club "[FC Barcelona](https://en.wikipedia.org/wiki/FC_Barcelona)" +- the second mention of "Barcelona" refers to the city of "[Barcelona](https://en.wikipedia.org/wiki/Barcelona)" + diff --git a/docs/tutorial/tutorial-basics/how-predictions-work.md b/docs/tutorial/tutorial-basics/how-predictions-work.md new file mode 100644 index 000000000..9911f6efa --- /dev/null +++ b/docs/tutorial/tutorial-basics/how-predictions-work.md @@ -0,0 +1,78 @@ +# How predictions work + +All taggers in Flair make predictions. This tutorial helps you understand what information you can get out of each prediction. + +## Running example + +Let's use our standard NER example to illustrate how annotations work: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('ner') + +# make a sentence +sentence = Sentence('George Washington went to Washington.') + +# predict NER tags +tagger.predict(sentence) + +# print the sentence with the tags +print(sentence) +``` + +This should print: +```console +Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC] +``` + +Showing us that two entities are labeled in this sentence: "George Washington" as PER (person) and "Washington" +as LOC (location). + +## Getting the predictions + +A common question that gets asked is **how to access these predictions directly**. You can do this by using +the [`get_labels()`](#flair.data.Sentence.get_labels) method to iterate over all predictions: + +```python +for label in sentence.get_labels(): + print(label) +``` +This should print the two NER predictions: + +```console +Span[0:2]: "George Washington" → PER (0.9989) +Span[4:5]: "Washington" → LOC (0.9942) +``` + +As you can see, each entity is printed, together with the predicted class. +The confidence of the prediction is indicated as a score in brackets.
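+ +For sequence labeling tasks like NER, the data point each label attaches to is a [`Span`](#flair.data.Span) covering one or more tokens. As a small sketch building on the example above (assuming the `tokens` attribute of spans and the `idx` attribute of tokens), you can inspect which tokens each prediction covers: + +```python +# each NER prediction attaches to a Span over one or more tokens +for label in sentence.get_labels('ner'): +    span = label.data_point +    token_ids = [token.idx for token in span.tokens] +    print(f'"{span.text}" covers token ids {token_ids}') +```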
+ +## Values for each prediction + +For each prediction, you can even **directly access** the label value, and all other attributes of the [`Label`](#flair.data.Label) class: + +```python +# iterate over all labels in the sentence +for label in sentence.get_labels(): + # print label value and score + print(f'label.value is: "{label.value}"') + print(f'label.score is: "{label.score}"') + # access the data point to which label attaches and print its text + print(f'the text of label.data_point is: "{label.data_point.text}"\n') +``` + +This should print: +```console +label.value is: "PER" +label.score is: "0.998886227607727" +the text of label.data_point is: "George Washington" + +label.value is: "LOC" +label.score is: "0.9942097663879395" +the text of label.data_point is: "Washington" +``` + + diff --git a/docs/tutorial/tutorial-basics/how-to-tag-corpus.md b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md new file mode 100644 index 000000000..8aa75a402 --- /dev/null +++ b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md @@ -0,0 +1,32 @@ +# How to tag a whole corpus + +Often, you may want to tag an entire text corpus. In this case, you need to split the corpus into sentences and pass a +list of [`Sentence`](#flair.data.Sentence) objects to the [`Classifier.predict()`](#flair.nn.Classifier.predict) method. + +For instance, you can use a [`SentenceSplitter`](#flair.splitter.SentenceSplitter) to split your text: + +```python +from flair.nn import Classifier +from flair.splitter import SegtokSentenceSplitter + +# example text with many sentences +text = "This is a sentence. This is another sentence. I love Berlin." + +# initialize sentence splitter +splitter = SegtokSentenceSplitter() + +# use splitter to split text into list of sentences +sentences = splitter.split(text) + +# predict tags for sentences +tagger = Classifier.load('ner') +tagger.predict(sentences) + +# iterate through sentences and print predicted labels +for sentence in sentences: + print(sentence) +``` + +Using the `mini_batch_size` parameter of the [`Classifier.predict()`](#flair.nn.Classifier.predict) method, you can set the size of mini batches passed to the +tagger. Depending on your resources, you might want to play around with this parameter to optimize speed. + diff --git a/docs/tutorial/tutorial-basics/index.rst b/docs/tutorial/tutorial-basics/index.rst new file mode 100644 index 000000000..6e5997023 --- /dev/null +++ b/docs/tutorial/tutorial-basics/index.rst @@ -0,0 +1,17 @@ +Tutorial 1: Basic Tagging +========================= + +This tutorial shows you in more detail how to tag your text and access predictions, +and showcases various models we ship with Flair. + +.. toctree:: + :maxdepth: 1 + + basic-types + how-predictions-work + tagging-entities + tagging-sentiment + entity-linking + part-of-speech-tagging + other-models + how-to-tag-corpus diff --git a/docs/tutorial/tutorial-basics/other-models.md b/docs/tutorial/tutorial-basics/other-models.md new file mode 100644 index 000000000..74c39479b --- /dev/null +++ b/docs/tutorial/tutorial-basics/other-models.md @@ -0,0 +1,153 @@ +# Tagging other things + +This tutorial gives you a tour of **other crazy models** shipped with Flair. These include: +* tagging semantic frames +* chunking text +* relation extraction +* others + +Let's get started! + +## Semantic Frame Detection + +For English, we provide a pre-trained model that detects semantic frames in text, trained using Propbank 3.0 frames. 
+This provides a sort of word sense disambiguation for frame-evoking words, and we are curious what researchers might +do with this. + +Here's an example: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load model +tagger = Classifier.load('frame') + +# make English sentence +sentence = Sentence('George returned to Berlin to return his hat.') + +# predict frame tags +tagger.predict(sentence) + +# go through tokens and print predicted frame (if one is predicted) +for token in sentence: + print(token) +``` +This should print: + +```console +Token[0]: "George" +Token[1]: "returned" → return.01 (0.9951) +Token[2]: "to" +Token[3]: "Berlin" +Token[4]: "to" +Token[5]: "return" → return.02 (0.6361) +Token[6]: "his" +Token[7]: "hat" +Token[8]: "." +``` + +As we can see, the frame detector makes a distinction in the sentence between two different meanings of the word 'return'. 'return.01' means returning to a location, while 'return.02' means giving something back. + +## Syntactic Chunking + +For English, we provide a model for chunking verb and noun phrases, trained using CoNLL 2000. +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load model +tagger = Classifier.load('chunk') + +# make English sentence +sentence = Sentence('The quick brown fox jumps over the lazy dog.') + +# predict chunk tags +tagger.predict(sentence) + +# print the chunks +for chunk in sentence.get_labels(): + print(chunk) +``` + +This should print: + +```console +Span[0:4]: "The quick brown fox" → NP (0.9914) +Span[4:5]: "jumps" → VP (1.0) +Span[5:6]: "over" → PP (0.9967) +Span[6:9]: "the lazy dog" → NP (0.9991) +``` +This tells us for instance that "the quick brown fox" and "the lazy dog" form syntactic units in this sentence. + + +## Tagging Relations + +Relations hold between two entities. For instance, a text like "_George was born in Washington_" +names two entities and also expresses that there is a born_in relationship between +both. + +We added an experimental relation extraction model trained over a modified version of TACRED. +You must use this model together with an entity tagger. Here is an example: + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# 1. make example sentence +sentence = Sentence("George was born in Washington") + +# 2. load entity tagger and predict entities +tagger = Classifier.load('ner-fast') +tagger.predict(sentence) + +# check which named entities have been found in the sentence +entities = sentence.get_labels('ner') +for entity in entities: + print(entity) +print("") + +# 3. load relation extractor +extractor = Classifier.load('relations') + +# predict relations +extractor.predict(sentence) + +# use get_labels() with parameter 'relation' to iterate over all relation predictions +relations = sentence.get_labels('relation') +for relation in relations: + print(relation) +``` + +This should print: + +```console +Span[0:1]: "George" → PER (0.9971) +Span[4:5]: "Washington" → LOC (0.9847) + +Relation[0:1][4:5]: "George -> Washington" → born_in (1.0) +``` + +Indicating that a **born_in** relationship holds between "George" and "Washington"!
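+ +Each relation label attaches to a [`Relation`](#flair.data.Relation) data point that connects two entity spans. As a short sketch of how you might pull out the two endpoints (assuming the `first` and `second` span attributes of [`Relation`](#flair.data.Relation)): + +```python +# inspect the two entity spans connected by each predicted relation +for label in sentence.get_labels('relation'): +    relation = label.data_point +    print(f'{relation.first.text} --{label.value}--> {relation.second.text}') +```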
+ +## List of Other Models + +We end this section with a list of all other models we currently ship with Flair: + +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[chunk](https://huggingface.co/flair/chunk-english)' | Chunking | English | Conll-2000 | **96.47** (F1) | +| '[chunk-fast](https://huggingface.co/flair/chunk-english-fast)' | Chunking | English | Conll-2000 | **96.22** (F1) |(fast model) +| '[frame](https://huggingface.co/flair/frame-english)' | Frame Detection | English | Propbank 3.0 | **97.54** (F1) | +| '[frame-fast](https://huggingface.co/flair/frame-english-fast)' | Frame Detection | English | Propbank 3.0 | **97.31** (F1) | (fast model) +| 'negation-speculation' | Negation / speculation |English | Bioscope | **80.2** (F1) | +| 'communicative-functions' | detecting function of sentence in research paper (BETA) | English| scholarly papers | | +| 'de-historic-indirect' | historical indirect speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-direct' | historical direct speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-reported' | historical reported speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-free-indirect' | historical free-indirect speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | + diff --git a/docs/tutorial/tutorial-basics/part-of-speech-tagging.md b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md new file mode 100644 index 000000000..b19587100 --- /dev/null +++ b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md @@ -0,0 +1,170 @@ +# Tagging parts-of-speech + +This tutorial shows you how to do part-of-speech tagging in Flair, showcases universal and language-specific models, and gives a list of all PoS models in Flair. + +## Language-specific parts-of-speech (PoS) + + +Syntax is fundamentally language-specific, so each language has different fine-grained parts-of-speech. Flair offers models for many languages: + +### ... in English + +For English, we offer several models trained over Ontonotes. + +Use like this: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('pos') + +# make a sentence +sentence = Sentence('Dirk went to the store.') + +# predict PoS tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print: +```console +Sentence[6]: "Dirk went to the store." → ["Dirk"/NNP, "went"/VBD, "to"/IN, "the"/DT, "store"/NN, "."/.] +``` + +This printout tells us for instance that "_Dirk_" is a proper noun (tag: NNP), and "_went_" is a past tense verb (tag: VBD). + +```{note} +To better understand what each tag means, consult the [tag specification](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) of the Penn Treebank. +``` + +### ... in German
+ +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('de-pos') + +# make a sentence +sentence = Sentence('Dort hatte er einen Hut gekauft.') + +# predict PoS tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print: +```console +Sentence[7]: "Dort hatte er einen Hut gekauft." → ["Dort"/ADV, "hatte"/VAFIN, "er"/PPER, "einen"/ART, "Hut"/NN, "gekauft"/VVPP, "."/$.] +``` + + +### ... in Ukrainian + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('pos-ukrainian') + +# make a sentence +sentence = Sentence("Сьогодні в Знам’янці проживають нащадки поета — родина Шкоди.") + +# predict PoS tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + + +### ... in Arabic + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('ar-pos') + +# make a sentence +sentence = Sentence('عمرو عادلي أستاذ للاقتصاد السياسي المساعد في الجامعة الأمريكية بالقاهرة .') + +# predict PoS tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +## Tagging universal parts-of-speech (uPoS) + +Universal parts-of-speech are a set of minimal syntactic units that exist across languages. For instance, most languages +will have VERBs or NOUNs. + + +We ship models trained over 14 languages to tag uPoS in **multilingual text**. Use like this: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load model +tagger = Classifier.load('pos-multi') + +# text with English and German sentences +sentence = Sentence('George Washington went to Washington. Dort kaufte er einen Hut.') + +# predict PoS tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print (line breaks added for readability): +```console +Sentence: "George Washington went to Washington . Dort kaufte er einen Hut ." + +→ ["George"/PROPN, "Washington"/PROPN, "went"/VERB, "to"/ADP, "Washington"/PROPN, "."/PUNCT] + +→ ["Dort"/ADV, "kaufte"/VERB, "er"/PRON, "einen"/DET, "Hut"/NOUN, "."/PUNCT] +``` + +Note, however, that these models were trained on a mix of European languages and will therefore not work well for other languages. + +## List of POS Models + +We end this section with a list of all models we currently ship with Flair. + +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[pos](https://huggingface.co/flair/pos-english)' | POS-tagging | English | Ontonotes |**98.19** (Accuracy) | +| '[pos-fast](https://huggingface.co/flair/pos-english-fast)' | POS-tagging | English | Ontonotes | **98.1** (Accuracy) |(fast model) +| '[upos](https://huggingface.co/flair/upos-english)' | POS-tagging (universal) | English | Ontonotes | **98.6** (Accuracy) | +| '[upos-fast](https://huggingface.co/flair/upos-english-fast)' | POS-tagging (universal) | English | Ontonotes | **98.47** (Accuracy) | (fast model) +| '[pos-multi](https://huggingface.co/flair/upos-multi)' | POS-tagging | Multilingual | UD Treebanks | **96.41** (average acc.) | (12 languages) +| '[pos-multi-fast](https://huggingface.co/flair/upos-multi-fast)' | POS-tagging | Multilingual | UD Treebanks | **92.88** (average acc.) | (12 languages)
| '[ar-pos](https://huggingface.co/megantosh/flair-arabic-dialects-codeswitch-egy-lev)' | POS-tagging | Arabic (+dialects) | combination of corpora | | |
| 'de-pos' | POS-tagging | German | UD German - HDT | **98.50** (Accuracy) | |
| 'de-pos-tweets' | POS-tagging | German | German Tweets | **93.06** (Accuracy) | [stefan-it](https://github.com/stefan-it/flair-experiments/tree/master/pos-twitter-german) |
| 'da-pos' | POS-tagging | Danish | [Danish Dependency Treebank](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | | [AmaliePauli](https://github.com/AmaliePauli) |
| 'ml-pos' | POS-tagging | Malayalam | 30000 Malayalam sentences | **83** | [sabiqueqb](https://github.com/sabiqueqb) |
| 'ml-upos' | POS-tagging | Malayalam | 30000 Malayalam sentences | **87** | [sabiqueqb](https://github.com/sabiqueqb) |
| 'pt-pos-clinical' | POS-tagging | Portuguese | [PUCPR](https://github.com/HAILab-PUCPR/portuguese-clinical-pos-tagger) | **92.39** | [LucasFerroHAILab](https://github.com/LucasFerroHAILab) for clinical texts |
| '[pos-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-pos)' | POS-tagging | Ukrainian | [Ukrainian UD](https://universaldependencies.org/treebanks/uk_iu/index.html) | **97.93** (F1) | [dchaplinsky](https://github.com/dchaplinsky) |

You choose which pre-trained model you load by passing the appropriate string to the [`Classifier.load()`](#flair.nn.Classifier.load) method.

A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads).

diff --git a/docs/tutorial/tutorial-basics/tagging-entities.md b/docs/tutorial/tutorial-basics/tagging-entities.md
new file mode 100644
index 000000000..f05ce16b2
--- /dev/null
+++ b/docs/tutorial/tutorial-basics/tagging-entities.md
@@ -0,0 +1,202 @@
# Tagging entities

This tutorial shows you how to do named entity recognition, showcases various NER models, and provides a full list of all NER models in Flair.

## Tagging entities with our standard model

Our standard model uses Flair embeddings, was trained over the English CoNLL-03 task, and can recognize 4 different entity types. It offers a good tradeoff between accuracy and speed.

As an example, let's use the sentence "_George Washington went to Washington._":

```python
from flair.nn import Classifier
from flair.data import Sentence

# load the model
tagger = Classifier.load('ner')

# make a sentence
sentence = Sentence('George Washington went to Washington.')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)
```

This should print:
```console
Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC]
```

The printout tells us that two entities are labeled in this sentence: "George Washington" as PER (person) and "Washington" as LOC (location).

## Tagging entities with our best model

Our best 4-class model is trained using a very large transformer. Use it if accuracy is most important to you, and speed/memory less so.
```python
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence('George Washington went to Washington.')

# load the NER tagger
tagger = Classifier.load('ner-large')

# run NER over sentence
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)
```

As you can see, it's the same code, just with '**ner-large**' as model instead of '**ner**'.
This model also works with most languages.

```{note}
If you want the fastest model we ship, you can also try 'ner-fast'.
```

## Tagging entities in non-English text

We also have NER models for text in other languages.

### Tagging a German sentence

To tag a German sentence, just load the appropriate model:

```python
from flair.nn import Classifier
from flair.data import Sentence

# load model
tagger = Classifier.load('de-ner-large')

# make German sentence
sentence = Sentence('George Washington ging nach Washington.')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)
```

This should print:
```console
Sentence: "George Washington ging nach Washington ." → ["George Washington"/PER, "Washington"/LOC]
```

### Tagging an Arabic sentence

Flair also works for languages that write from right to left. To tag an Arabic sentence, just load the appropriate model:

```python
from flair.nn import Classifier
from flair.data import Sentence

# load model
tagger = Classifier.load('ar-ner')

# make Arabic sentence
sentence = Sentence("احب برلين")

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)
```

This should print:
```console
Sentence[2]: "احب برلين" → ["برلين"/LOC]
```

## Tagging Entities with 18 Classes

We also ship models that distinguish between more than just 4 classes. For instance, use our ontonotes models
to classify 18 different types of entities.

```python
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence('On September 1st George won 1 dollar while watching Game of Thrones.')

# load the NER tagger
tagger = Classifier.load('ner-ontonotes-large')

# run NER over sentence
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)
```

This should print:
```console
Sentence[13]: "On September 1st George won 1 dollar while watching Game of Thrones." → ["September 1st"/DATE, "George"/PERSON, "1 dollar"/MONEY, "Game of Thrones"/WORK_OF_ART]
```

It finds, for instance, that "Game of Thrones" is a work of art and that "September 1st" is a date.

## Biomedical Data

For biomedical data, we offer the hunflair models that detect 5 different types of biomedical entities.

```python
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence('Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome.')

# load the NER tagger
tagger = Classifier.load('bioner')

# run NER over sentence
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)
```

This should print:
```console
Sentence[13]: "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome." → ["Behavioral abnormalities"/Disease, "Fmr1"/Gene, "Mouse"/Species, "Fragile X Syndrome"/Disease]
```

The model thus finds entities of the classes "Species", "Disease" and "Gene" in this text.

## List of NER Models

We end this section with a list of all models we currently ship with Flair.
+ +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[ner](https://huggingface.co/flair/ner-english)' | NER (4-class) | English | Conll-03 | **93.03** (F1) | +| '[ner-fast](https://huggingface.co/flair/ner-english-fast)' | NER (4-class) | English | Conll-03 | **92.75** (F1) | (fast model) +| '[ner-large](https://huggingface.co/flair/ner-english-large)' | NER (4-class) | English / Multilingual | Conll-03 | **94.09** (F1) | (large model) +| 'ner-pooled' | NER (4-class) | English | Conll-03 | **93.24** (F1) | (memory inefficient) +| '[ner-ontonotes](https://huggingface.co/flair/ner-english-ontonotes)' | NER (18-class) | English | Ontonotes | **89.06** (F1) | +| '[ner-ontonotes-fast](https://huggingface.co/flair/ner-english-ontonotes-fast)' | NER (18-class) | English | Ontonotes | **89.27** (F1) | (fast model) +| '[ner-ontonotes-large](https://huggingface.co/flair/ner-english-ontonotes-large)' | NER (18-class) | English / Multilingual | Ontonotes | **90.93** (F1) | (large model) +| '[ar-ner](https://huggingface.co/megantosh/flair-arabic-multi-ner)' | NER (4-class) | Arabic | AQMAR & ANERcorp (curated) | **86.66** (F1) | | +| '[da-ner](https://huggingface.co/flair/ner-danish)' | NER (4-class) | Danish | [Danish NER dataset](https://github.com/alexandrainst/danlp) | | [AmaliePauli](https://github.com/AmaliePauli) | +| '[de-ner](https://huggingface.co/flair/ner-german)' | NER (4-class) | German | Conll-03 | **87.94** (F1) | | +| '[de-ner-large](https://huggingface.co/flair/ner-german-large)' | NER (4-class) | German / Multilingual | Conll-03 | **92.31** (F1) | | +| 'de-ner-germeval' | NER (4-class) | German | Germeval | **84.90** (F1) | | +| '[de-ner-legal](https://huggingface.co/flair/ner-german-legal)' | NER (legal text) | German | [LER](https://github.com/elenanereiss/Legal-Entity-Recognition) dataset | **96.35** (F1) | | +| '[fr-ner](https://huggingface.co/flair/ner-french)' | NER (4-class) | French | [WikiNER (aij-wikiner-fr-wp3)](https://github.com/dice-group/FOX/tree/master/input/Wikiner) | **95.57** (F1) | [mhham](https://github.com/mhham) | +| '[es-ner-large](https://huggingface.co/flair/ner-spanish-large)' | NER (4-class) | Spanish | CoNLL-03 | **90.54** (F1) | [mhham](https://github.com/mhham) | +| '[nl-ner](https://huggingface.co/flair/ner-dutch)' | NER (4-class) | Dutch | [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/) | **92.58** (F1) | | +| '[nl-ner-large](https://huggingface.co/flair/ner-dutch-large)' | NER (4-class) | Dutch | Conll-03 | **95.25** (F1) | | +| 'nl-ner-rnn' | NER (4-class) | Dutch | [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/) | **90.79** (F1) | | +| '[ner-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-ner)' | NER (4-class) | Ukrainian | [NER-UK dataset](https://github.com/lang-uk/ner-uk) | **86.05** (F1) | [dchaplinsky](https://github.com/dchaplinsky) | + + +You choose which pre-trained model you load by passing the appropriate string to the [`Classifier.load()`](#flair.nn.Classifier.load) method. + +A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads). 
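Many community models on the hub are not covered by a shorthand name like 'ner'. As a minimal sketch (assuming the 'flair/ner-english-large' repository ID from the table above), you can generally load such models by passing their full Hugging Face repository ID to the same method:

```python
from flair.data import Sentence
from flair.nn import Classifier

# load a model by its full Hugging Face hub repository ID
# instead of a Flair shorthand name
tagger = Classifier.load('flair/ner-english-large')

sentence = Sentence('George Washington went to Washington.')
tagger.predict(sentence)
print(sentence)
```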
diff --git a/docs/tutorial/tutorial-basics/tagging-sentiment.md b/docs/tutorial/tutorial-basics/tagging-sentiment.md
new file mode 100644
index 000000000..1e4dd688a
--- /dev/null
+++ b/docs/tutorial/tutorial-basics/tagging-sentiment.md
@@ -0,0 +1,79 @@
# Tagging sentiment

This tutorial shows you how to do sentiment analysis in Flair.

## Tagging sentiment with our standard model

Our standard sentiment analysis model uses distilBERT embeddings and was trained over a mix of corpora, notably
the Amazon review corpus, and can thus handle a variety of domains and languages.

Let's use an example sentence:

```python
from flair.nn import Classifier
from flair.data import Sentence

# load the model
tagger = Classifier.load('sentiment')

# make a sentence
sentence = Sentence('This movie is not at all bad.')

# predict sentiment
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)
```

This should print:
```console
Sentence[8]: "This movie is not at all bad." → POSITIVE (0.9929)
```

Showing us that the sentence overall is tagged to be of POSITIVE sentiment.

## Tagging sentiment with our fast model

We also offer an RNN-based variant which is faster but less accurate. Use it like this:

```python
from flair.nn import Classifier
from flair.data import Sentence

# load the model
tagger = Classifier.load('sentiment-fast')

# make a sentence
sentence = Sentence('This movie is very bad.')

# predict sentiment
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)
```

This should print:
```console
Sentence[6]: "This movie is very bad." → NEGATIVE (0.9999)
```

This indicates that the sentence is of NEGATIVE sentiment. As you can see, it's the same code as above, just loading the
'**sentiment-fast**' model instead of '**sentiment**'.

## List of Sentiment Models

We end this section with a list of all models we currently ship with Flair:

| ID | Language | Task | Training Dataset | Accuracy |
| ------------- | ---- | ------------- |------------- |------------- |
| 'sentiment' | English | detecting positive and negative sentiment (transformer-based) | movie and product reviews | **98.87** |
| 'sentiment-fast' | English | detecting positive and negative sentiment (RNN-based) | movie and product reviews | **96.83** |
| 'de-offensive-language' | German | detecting offensive language | [GermEval 2018 Task 1](https://projects.fzai.h-da.de/iggsa/projekt/) | **75.71** (Macro F1) |

diff --git a/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md
new file mode 100644
index 000000000..817190071
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md
@@ -0,0 +1,115 @@
# Classic Word Embeddings

Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed
embedding. Most embeddings fall under this class, including the popular GloVe or Komninos embeddings.

Simply instantiate the [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) class and pass a string identifier of the embedding you wish to load. So, if
you want to use GloVe embeddings, pass the string 'glove' to the constructor:

```python
from flair.embeddings import WordEmbeddings
from flair.data import Sentence

# init embedding
glove_embedding = WordEmbeddings('glove')
```

Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method.
You can also pass a list of sentences to
this method since some embedding types make use of batching to increase speed.

```python
# create sentence.
sentence = Sentence('The grass is green .')

# embed a sentence using glove.
glove_embedding.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)
```

This prints out the tokens and their embeddings. GloVe embeddings are PyTorch vectors of dimensionality 100.

You choose which pre-trained embeddings you load by passing the appropriate
id string to the constructor of the [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) class. Typically, you use
the **two-letter language code** to init an embedding, so 'en' for English and
'de' for German and so on. By default, this will initialize FastText embeddings trained over Wikipedia.
You can also use FastText embeddings trained over web crawls by appending '-crawl' to the language code, so 'de-crawl'
for embeddings trained over German web crawls.

For English, we provide a few more options, so
here you can choose between instantiating 'en-glove', 'en-extvec' and so on.

The following embeddings are currently supported:

| ID | Language | Embedding |
| ------------- | ------------- | ------------- |
| 'en-glove' (or 'glove') | English | GloVe embeddings |
| 'en-extvec' (or 'extvec') | English | Komninos embeddings |
| 'en-crawl' (or 'crawl') | English | FastText embeddings over Web crawls |
| 'en-twitter' (or 'twitter') | English | Twitter embeddings |
| 'en-turian' (or 'turian') | English | Turian embeddings (small) |
| 'en' (or 'en-news' or 'news') | English | FastText embeddings over news and wikipedia data |
| 'de' | German | German FastText embeddings |
| 'nl' | Dutch | Dutch FastText embeddings |
| 'fr' | French | French FastText embeddings |
| 'it' | Italian | Italian FastText embeddings |
| 'es' | Spanish | Spanish FastText embeddings |
| 'pt' | Portuguese | Portuguese FastText embeddings |
| 'ro' | Romanian | Romanian FastText embeddings |
| 'ca' | Catalan | Catalan FastText embeddings |
| 'sv' | Swedish | Swedish FastText embeddings |
| 'da' | Danish | Danish FastText embeddings |
| 'no' | Norwegian | Norwegian FastText embeddings |
| 'fi' | Finnish | Finnish FastText embeddings |
| 'pl' | Polish | Polish FastText embeddings |
| 'cz' | Czech | Czech FastText embeddings |
| 'sk' | Slovak | Slovak FastText embeddings |
| 'sl' | Slovenian | Slovenian FastText embeddings |
| 'sr' | Serbian | Serbian FastText embeddings |
| 'hr' | Croatian | Croatian FastText embeddings |
| 'bg' | Bulgarian | Bulgarian FastText embeddings |
| 'ru' | Russian | Russian FastText embeddings |
| 'ar' | Arabic | Arabic FastText embeddings |
| 'he' | Hebrew | Hebrew FastText embeddings |
| 'tr' | Turkish | Turkish FastText embeddings |
| 'fa' | Persian | Persian FastText embeddings |
| 'ja' | Japanese | Japanese FastText embeddings |
| 'ko' | Korean | Korean FastText embeddings |
| 'zh' | Chinese | Chinese FastText embeddings |
| 'hi' | Hindi | Hindi FastText embeddings |
| 'id' | Indonesian | Indonesian FastText embeddings |
| 'eu' | Basque | Basque FastText embeddings |

So, if you want to load German FastText embeddings, instantiate as follows:

```python
german_embedding = WordEmbeddings('de')
```

Alternatively, if you want to load German FastText embeddings trained over crawls, instantiate as follows:

```python
german_embedding = WordEmbeddings('de-crawl')
```

We generally recommend the FastText embeddings, or GloVe if you want a smaller model.
If you want to use any other embeddings (not listed above), you can load those by calling

```python
custom_embedding = WordEmbeddings('path/to/your/custom/embeddings.gensim')
```

If you want to load custom embeddings you need to make sure that the custom embeddings are correctly formatted for
[gensim](https://radimrehurek.com/gensim/models/word2vec.html).

You can, for example, convert [FastText embeddings](https://fasttext.cc/docs/en/crawl-vectors.html) to gensim using the
following code snippet:

```python
import gensim

word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttext/embeddings.txt', binary=False)
word_vectors.save('/path/to/converted')
```

However, FastText embeddings have the functionality of returning vectors for out-of-vocabulary words using sub-word information. If you want to use this, try the [`FastTextEmbeddings`](#flair.embeddings.token.FastTextEmbeddings) class.

diff --git a/docs/tutorial/tutorial-embeddings/embeddings.md b/docs/tutorial/tutorial-embeddings/embeddings.md
new file mode 100644
index 000000000..dfb78344b
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/embeddings.md
@@ -0,0 +1,138 @@
# Embeddings

This tutorial shows you how to use Flair to produce **embeddings** for words and documents. Embeddings
are vector representations that are useful for a variety of reasons. All Flair models are trained on
top of embeddings, so if you want to train your own models, you should understand how embeddings work.

## Example 1: Embedding Words with Transformers

Let's use a standard BERT model (bert-base-uncased) to embed the sentence "the grass is green".

Simply instantiate [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) and call [`embed()`](#flair.embeddings.base.Embeddings.embed) over an example sentence:

```python
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```

This will cause **each word in the sentence** to be embedded. You can iterate through the words and get
each embedding like this:

```python
# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)
```

This will print each token as a long PyTorch vector:
```console
Token[0]: "The"
tensor([-0.0323, -0.3904, -1.1946,  0.1296,  0.5806, ..], device='cuda:0')
Token[1]: "grass"
tensor([-0.3973,  0.2652, -0.1337,  0.4473,  1.1641, ..], device='cuda:0')
Token[2]: "is"
tensor([ 0.1374, -0.3688, -0.8292, -0.4068,  0.7717, ..], device='cuda:0')
Token[3]: "green"
tensor([-0.7722, -0.1152,  0.3661,  0.3570,  0.6573, ..], device='cuda:0')
Token[4]: "."
tensor([ 0.1441, -0.1772, -0.5911,  0.2236, -0.0497, ..], device='cuda:0')
```

*(Output truncated for readability; the actual vectors are much longer.)*

Transformer word embeddings are the most important concept in Flair. Check out more info in [this](project:transformer-embeddings.md) dedicated chapter.

## Example 2: Embedding Documents with Transformers

Sometimes you want to have an **embedding for a whole document**, not only individual words. In this case, use one of the
DocumentEmbeddings classes in Flair.
Let's again use a standard BERT model to get an embedding for the entire sentence "the grass is green":

```python
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence

# init embedding
embedding = TransformerDocumentEmbeddings('bert-base-uncased')

# create a sentence
sentence = Sentence('The grass is green .')

# embed the sentence
embedding.embed(sentence)
```

Now, the whole sentence is embedded. Print the embedding like this:

```python
# now check out the embedded sentence
print(sentence.embedding)
```

[`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) are the most important concept in Flair. Check out more info in [this](project:transformer-embeddings.md) dedicated chapter.

## How to Stack Embeddings

Flair allows you to combine embeddings into "embedding stacks". When not fine-tuning, using combinations of embeddings often gives best results!

Use the [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) class and instantiate it by passing a list of embeddings that you wish to combine. For instance, let's combine classic GloVe [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) with forward and backward [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings).

First, instantiate the two embeddings you wish to combine:

```python
from flair.embeddings import WordEmbeddings, FlairEmbeddings

# init standard GloVe embedding
glove_embedding = WordEmbeddings('glove')

# init Flair forward and backwards embeddings
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
```

Now instantiate the [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) class and pass it a list containing these two embeddings.

```python
from flair.embeddings import StackedEmbeddings

# create a StackedEmbedding object that combines glove and forward/backward flair embeddings
stacked_embeddings = StackedEmbeddings([
    glove_embedding,
    flair_embedding_forward,
    flair_embedding_backward,
])
```

That's it! Now just use this embedding like all the other embeddings, i.e. call the [`embed()`](#flair.embeddings.base.Embeddings.embed) method over your sentences.

```python
sentence = Sentence('The grass is green .')

# just embed a sentence using the StackedEmbedding as you would with any single embedding.
stacked_embeddings.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)
```

Words are now embedded using a concatenation of three different embeddings. This means that the resulting embedding
vector is still a single PyTorch vector.

diff --git a/docs/tutorial/tutorial-embeddings/flair-embeddings.md b/docs/tutorial/tutorial-embeddings/flair-embeddings.md
new file mode 100644
index 000000000..817928a2a
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/flair-embeddings.md
@@ -0,0 +1,142 @@
# Flair embeddings

Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)
that capture latent syntactic-semantic information that goes beyond
standard word embeddings. Key differences are:
1) they are trained without any explicit notion of words and thus fundamentally model words as sequences of characters.
2) they are **contextualized** by their surrounding text, meaning that the *same word will have different embeddings depending on its contextual use*.

With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings:

```python
from flair.embeddings import FlairEmbeddings
from flair.data import Sentence

# init embedding
flair_embedding_forward = FlairEmbeddings('news-forward')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
flair_embedding_forward.embed(sentence)
```

You choose which embeddings you load by passing the appropriate string to the constructor of the [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings) class.
Currently, the following contextual string embeddings are provided (note: replace '*X*' with either '*forward*' or '*backward*'):

| ID | Language | Embedding |
|-------------------------|-------------------------------------------------|------------------------------------------------------------------------------------------|
| 'multi-X' | 300+ | [JW300 corpus](http://opus.nlpl.eu/JW300.php), as proposed by [Agić and Vulić (2019)](https://www.aclweb.org/anthology/P19-1310/). The corpus is licensed under CC-BY-NC-SA |
| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly |
| 'news-X' | English | Trained with 1 billion word corpus |
| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly |
| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) |
| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) |
| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) |
| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) |
| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* |
| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* |
| 'de-historic-rw-X' | German (historical) | Added by [@redewiedergabe](https://github.com/redewiedergabe): Historical German trained over 100 million tokens |
| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia |
| 'es-X-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia, CPU-friendly |
| 'es-clinical-X' | Spanish (clinical) | Added by [@matirojasg](https://github.com/flairNLP/flair/issues/2292): Trained with Wikipedia |
| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) |
| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia |
| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'id-X' | Indonesian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers) |
| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) |
| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) |
| 'pl-opus-X' | Polish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings |
| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) |
| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS |
| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) |
| 'ta-X' | Tamil | Added by [@stefan-it](https://github.com/stefan-it/plur) |
| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers) |
| 'de-impresso-hipe-v1-X' | German (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) |
| 'en-impresso-hipe-v1-X' | English (historical) | In-domain data (Chronicling America material) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) |
| 'fr-impresso-hipe-v1-X' | French (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) |
| 'am-X' | Amharic | Based on 6.5m Amharic text corpus crawled from different sources. See [this paper](https://www.mdpi.com/1999-5903/13/11/275) and the official [GitHub Repository](https://github.com/uhh-lt/amharicmodels) for more information. |
| 'uk-X' | Ukrainian | Added by [@dchaplinsky](https://github.com/dchaplinsky): Trained with [UberText](https://lang.org.ua/en/corpora/) corpus. |

So, if you want to load embeddings from the German forward LM model, instantiate as follows:

```python
flair_de_forward = FlairEmbeddings('de-forward')
```

And if you want to load embeddings from the Bulgarian backward LM model, instantiate as follows:

```python
flair_bg_backward = FlairEmbeddings('bg-backward')
```

## Recommended Flair usage

We recommend combining both forward and backward Flair embeddings. Depending on the task, we also recommend adding standard [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) into the mix. So, our recommended [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) for most English tasks is:

```python
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# create a StackedEmbedding object that combines glove and forward/backward flair embeddings
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])
```

That's it! Now just use this embedding like all the other embeddings, i.e. call the [`embed()`](#flair.embeddings.base.Embeddings.embed) method over your sentences.

```python
sentence = Sentence('The grass is green .')

# just embed a sentence using the StackedEmbedding as you would with any single embedding.
stacked_embeddings.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)
```

Words are now embedded using a concatenation of three different embeddings. This combination often gives state-of-the-art accuracy.

## Pooled Flair embeddings

We also developed a pooled variant of the [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). These embeddings differ in that they *constantly evolve over time*, even at prediction time (i.e. after training is complete). This means that the same words in the same sentence at two different points in time may have different embeddings.

[`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings) manage a 'global' representation of each distinct word by using a pooling operation over all past occurrences. More details on how this works may be found in [Akbik et al. (2019)](https://www.aclweb.org/anthology/N19-1078/).
You can instantiate and use [`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings) like [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings):

```python
from flair.embeddings import PooledFlairEmbeddings
from flair.data import Sentence

# init embedding
flair_embedding_forward = PooledFlairEmbeddings('news-forward')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
flair_embedding_forward.embed(sentence)
```

Note that while we get some of our best results with [`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings), they are very inefficient memory-wise since they keep past embeddings of all words in memory. In many cases, regular [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings) will be nearly as good but with much lower memory requirements.

diff --git a/docs/tutorial/tutorial-embeddings/index.rst b/docs/tutorial/tutorial-embeddings/index.rst
new file mode 100644
index 000000000..71c4acb34
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/index.rst
@@ -0,0 +1,16 @@
Tutorial 3: Embeddings
======================

This tutorial shows you how to use Flair to produce embeddings for words and documents.
Embeddings are vector representations that are useful for a variety of reasons.
All Flair models are trained on top of embeddings, so if you want to train your own models,
you should understand how embeddings work.

.. toctree::
   :maxdepth: 1

   embeddings
   transformer-embeddings
   flair-embeddings
   classic-word-embeddings
   other-embeddings

diff --git a/docs/tutorial/tutorial-embeddings/other-embeddings.md b/docs/tutorial/tutorial-embeddings/other-embeddings.md
new file mode 100644
index 000000000..814cf0f8d
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/other-embeddings.md
@@ -0,0 +1,264 @@
# Other embeddings in Flair

Flair supports many other embedding types. This section introduces these embeddings.

```{note}
We mostly train our models with either [`TransformerEmbeddings`](#flair.embeddings.transformer.TransformerEmbeddings) or [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). The embeddings presented here might be useful
for specific use cases or for comparison purposes.
```

## One-Hot Embeddings

[`OneHotEmbeddings`](#flair.embeddings.token.OneHotEmbeddings) are embeddings that encode each word in a vocabulary as a one-hot vector, followed by an embedding
layer. Unlike most other embeddings, they thus do not encode any prior knowledge. They also differ in that they
require a vocabulary (`vocab_dictionary`) during instantiation. Such a dictionary can be passed as an argument
during class initialization or constructed directly from a corpus with the [`OneHotEmbeddings.from_corpus`](#flair.embeddings.token.OneHotEmbeddings.from_corpus) method. The dictionary consists
of all unique tokens contained in the corpus plus an UNK token for all rare words.

You initialize these embeddings like this:

```python
from flair.embeddings import OneHotEmbeddings
from flair.datasets import UD_ENGLISH
from flair.data import Sentence

# load a corpus
corpus = UD_ENGLISH()

# init embedding
embeddings = OneHotEmbeddings.from_corpus(corpus)

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embeddings.embed(sentence)
```

By default, the 'text' of a token (i.e. its lexical value) is one-hot encoded and the embedding layer has a dimensionality
of 300.
However, this layer is randomly initialized, meaning that these embeddings do not make sense unless they are trained on a task.

### Vocabulary size

By default, all words that occur in the corpus at least 3 times are part of the vocabulary. You can change
this using the `min_freq` parameter. For instance, if your corpus is very large you might want to set a
higher `min_freq`:

```python
embeddings = OneHotEmbeddings.from_corpus(corpus, min_freq=10)
```

### Embedding dimensionality

By default, the embeddings have a dimensionality of 300. If you want to try higher or lower values, you can use the
`embedding_length` parameter:

```python
embeddings = OneHotEmbeddings.from_corpus(corpus, embedding_length=100)
```

### Embedding other tags

Sometimes, you want to embed something other than text. For instance, sometimes we have part-of-speech tags or
named entity annotation available that we might want to use. If this field exists in your corpus, you can embed
it by passing the `field` parameter. For instance, the UD corpora have a universal part-of-speech tag for each
token ('upos'). Embed it like so:

```python
from flair.datasets import UD_ENGLISH
from flair.embeddings import OneHotEmbeddings

# load corpus
corpus = UD_ENGLISH()

# embed POS tags
embeddings = OneHotEmbeddings.from_corpus(corpus, field='upos')
```

This should print a vocabulary of size 18 consisting of universal part-of-speech tags.

## Byte Pair Embeddings

[`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) are word embeddings that are precomputed on the subword-level. This means that they are able to
embed any word by splitting words into subwords and looking up their embeddings. [`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) were proposed
and computed by [Heinzerling and Strube (2018)](https://www.aclweb.org/anthology/L18-1473), who found that they offer nearly the same accuracy as word embeddings, but at a fraction
of the model size. So they are a great choice if you want to train small models.

You initialize with a language code (275 languages supported), a number of 'syllables' (one of ) and
a number of dimensions (one of 50, 100, 200 or 300). The following initializes and uses byte pair embeddings
for English:

```python
from flair.embeddings import BytePairEmbeddings
from flair.data import Sentence

# init embedding
embedding = BytePairEmbeddings('en')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```

More information can be found
on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page.

[`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) also have a multilingual model capable of embedding any word in any language.
You can instantiate it with:

```python
# init embedding
embedding = BytePairEmbeddings('multi')
```

You can also load custom [`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) by specifying paths via the `model_file_path` and `embedding_file_path` arguments. They correspond, respectively, to a SentencePiece model file and an embedding file (Word2Vec plain text or gensim binary). For example:

```python
# init custom embedding
embedding = BytePairEmbeddings(model_file_path='your/path/m.model', embedding_file_path='your/path/w2v.txt')
```

## Document Pool Embeddings

[`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) calculate a pooling operation over all word embeddings in a document.
The default operation is `mean` which gives us the mean of all words in the sentence.
The resulting embedding is taken as document embedding.

To create a mean document embedding simply create any number of [`TokenEmbeddings`](#flair.embeddings.base.TokenEmbeddings) first and put them in a list.
Afterwards, instantiate the [`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) with this list of [`TokenEmbeddings`](#flair.embeddings.base.TokenEmbeddings).
So, if you want to create a document embedding using GloVe embeddings together with [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings),
use the following code:

```python
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([glove_embedding])
```

Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method.

```python
# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.embedding)
```

This prints out the embedding of the document. Since the document embedding is derived from word embeddings, its dimensionality depends on the dimensionality of word embeddings you are using.

You have the following optional constructor arguments:

| Argument | Default | Description |
| -------------------- | ------------------- | ------------------------------------------------------------------------------ |
| `fine_tune_mode` | `linear` | One of `linear`, `nonlinear` and `none`. |
| `pooling` | `mean` | One of `mean`, `max` and `min`. |

### Pooling operation

Next to the `mean` pooling operation you can also use `min` or `max` pooling. Simply pass the pooling operation you want
to use to the initialization of the `DocumentPoolEmbeddings`:
```python
document_embeddings = DocumentPoolEmbeddings([glove_embedding], pooling='min')
```

### Fine-tune mode

You can also choose which fine-tuning operation you want, i.e. which transformation to apply before word embeddings get
pooled. The default operation is 'linear' transformation, but if you only use simple word embeddings that are
not task-trained you should probably use a 'nonlinear' transformation instead:

```python
# instantiate pre-trained word embeddings
embeddings = WordEmbeddings('glove')

# document pool embeddings
document_embeddings = DocumentPoolEmbeddings([embeddings], fine_tune_mode='nonlinear')
```

If on the other hand you use word embeddings that are task-trained (such as simple one-hot encoded embeddings), you
are often better off doing no transformation at all. Do this by passing 'none':

```python
# instantiate one-hot encoded word embeddings
embeddings = OneHotEmbeddings.from_corpus(corpus)

# document pool embeddings
document_embeddings = DocumentPoolEmbeddings([embeddings], fine_tune_mode='none')
```

## Document RNN Embeddings

Besides simple pooling we also support a method based on an RNN to obtain a [`DocumentEmbeddings`](#flair.embeddings.base.DocumentEmbeddings).
The RNN takes the word embeddings of every token in the document as input and provides its last output state as document
embedding. You can choose which type of RNN you wish to use.
In order to use the [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) you need to initialize them by passing a list of token embeddings:

```python
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')

document_embeddings = DocumentRNNEmbeddings([glove_embedding])
```

By default, a GRU-type RNN is instantiated. Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method.

```python
# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
```

This will output a single embedding for the complete sentence. The embedding dimensionality depends on the number of
hidden states you are using and whether the RNN is bidirectional or not.

### RNN type

If you want to use a different type of RNN, you need to set the `rnn_type` parameter in the constructor. So,
to initialize a document RNN embedding with an LSTM, do:

```python
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding], rnn_type='LSTM')
```

### Need to be trained on a task

Note that while [`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) are immediately meaningful, [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) need to be tuned on the
downstream task. This happens automatically in Flair if you train a new model with these embeddings.

Once the model is trained, you can access the tuned [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) object directly from the classifier object and use it to embed sentences.

```python
document_embeddings = classifier.document_embeddings

sentence = Sentence('The grass is green . And the sky is blue .')

document_embeddings.embed(sentence)

print(sentence.get_embedding())
```

[`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) have a number of hyperparameters that can be tuned; please take a look at their [API docs](#flair.embeddings.document.DocumentRNNEmbeddings) to find out more.

diff --git a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md
new file mode 100644
index 000000000..eae5eb35b
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md
@@ -0,0 +1,180 @@
# Transformer embeddings

Flair supports various Transformer-based architectures like BERT or XLNet from [HuggingFace](https://github.com/huggingface),
with two classes [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) (to embed words) and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) (to embed documents).
## Embedding words

For instance, to load a standard BERT transformer model, do:

```python
from flair.embeddings import TransformerWordEmbeddings

# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```

If instead you want to use RoBERTa, do:

```python
from flair.embeddings import TransformerWordEmbeddings

# init embedding
embedding = TransformerWordEmbeddings('roberta-base')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
embedding.embed(sentence)
```

Use the [Huggingface Model hub](https://huggingface.co/models) to find any open source text embedding model to use.

## Embedding sentences

To embed a whole sentence as one (instead of each word in the sentence), simply use the [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings)
instead:

```python
from flair.embeddings import TransformerDocumentEmbeddings

# init embedding
embedding = TransformerDocumentEmbeddings('roberta-base')

# create a sentence
sentence = Sentence('The grass is green .')

# embed the sentence
embedding.embed(sentence)
```

## Arguments

There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings)
and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) classes:

| Argument | Default | Description |
|------------------------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------|
| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) |
| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding |
| `subtoken_pooling` | `first` | See [Pooling operation section](#pooling). |
| `layer_mean` | `True` | See [Layer mean section](#layer-mean). |
| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. |
| `allow_long_sentences` | `True` | Whether or not texts longer than the maximal sequence length are supported. |
| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation |

### Layers

The `layers` argument controls which transformer layers are used for the embedding. If you set this value to '-1,-2,-3,-4', the top 4 layers are used to make an embedding. If you set it to '-1', only the last layer is used. If you set it to "all", then all layers are used.

This affects the length of an embedding, since layers are just concatenated.
```python
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

sentence = Sentence('The grass is green.')

# use only last layers
embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())

sentence.clear_embeddings()

# use last two layers
embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1,-2', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())

sentence.clear_embeddings()

# use ALL layers
embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='all', layer_mean=False)
embeddings.embed(sentence)
print(sentence[0].embedding.size())
```

This should print:
```console
torch.Size([768])
torch.Size([1536])
torch.Size([9984])
```

That is, the size of the embedding increases with the number of layers we use (but ONLY if `layer_mean` is set to False, otherwise the length is always the same).

(pooling)=
### Pooling operation

Most of the Transformer-based models (except Transformer-XL) use subword tokenization. E.g. the following
token `puppeteer` could be tokenized into the subwords: `pupp`, `##ete` and `##er`.

We implement different pooling operations for these subwords to generate the final token representation:

* `first`: only the embedding of the first subword is used
* `last`: only the embedding of the last subword is used
* `first_last`: embeddings of the first and last subwords are concatenated and used
* `mean`: a `torch.mean` over all subword embeddings is calculated and used

You can choose which one to use by passing this in the constructor:

```python
# use first and last subtoken for each word
embeddings = TransformerWordEmbeddings('bert-base-uncased', subtoken_pooling='first_last')
embeddings.embed(sentence)
print(sentence[0].embedding.size())
```

(layer-mean)=
### Layer mean

The Transformer-based models have a certain number of layers. If `layer_mean` is set to False, all layers you select are
concatenated as explained above. Alternatively, you can set `layer_mean=True` to do a mean over all
selected layers. The resulting vector will then always have the same dimensionality as a single layer:

```python
from flair.embeddings import TransformerWordEmbeddings

# init embedding
embedding = TransformerWordEmbeddings("roberta-base", layers="all", layer_mean=True)

# create a sentence
sentence = Sentence("The Oktoberfest is the world's largest Volksfest .")

# embed words in sentence
embedding.embed(sentence)
```

### Fine-tuneable or not

In some setups, you may wish to fine-tune the transformer embeddings. In this case, set `fine_tune=True` in the init method.
When fine-tuning, you should also only use the topmost layer, so best set `layers='-1'`.

```python
# use only the topmost layer and make the embeddings fine-tuneable
embeddings = TransformerWordEmbeddings('bert-base-uncased', fine_tune=True, layers='-1')
embeddings.embed(sentence)
print(sentence[0].embedding)
```

This will print a tensor that now has a gradient function and can be fine-tuned if you use it in a training routine.

```python
tensor([-0.0323, -0.3904, -1.1946,  ...,  0.1305, -0.1365, -0.4323],
       device='cuda:0', grad_fn=<CatBackward>)
```

### Models

Please have a look at the awesome [Huggingface Model hub](https://huggingface.co/models) to find any open source text embedding model to use.
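To close this chapter, here is a minimal sketch of the `use_context` option from the arguments table above, using the same 'bert-base-uncased' model as in the earlier examples:

```python
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

# include context beyond the sentence boundaries when computing embeddings;
# per the arguments table, this can increase accuracy on some tasks,
# but slows down embedding generation
embedding = TransformerWordEmbeddings('bert-base-uncased', use_context=True)

sentence = Sentence('The grass is green .')
embedding.embed(sentence)

print(sentence[0].embedding.size())
```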
diff --git a/docs/tutorial/tutorial-training/how-model-training-works.md b/docs/tutorial/tutorial-training/how-model-training-works.md
new file mode 100644
index 000000000..1380c22e7
--- /dev/null
+++ b/docs/tutorial/tutorial-training/how-model-training-works.md
@@ -0,0 +1,294 @@
# How model training works in Flair

In this section, we explain the main ideas of model training in Flair.

In particular, we give an introduction to the [`ModelTrainer`](#flair.trainers.ModelTrainer) class, and discuss what decisions you have to make to train good models.

## Example: Training a Part-of-Speech Tagger

As an example in this chapter, we train a simple part-of-speech tagger for English. To make the example run fast,

- we downsample the training data to 10%
- we use only simple classic word embeddings (GloVe)

Here is the full training code:

```python
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 1. load the corpus
corpus = UD_ENGLISH().downsample(0.1)
print(corpus)

# 2. what label do we want to predict?
label_type = 'upos'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. initialize embeddings
embeddings = WordEmbeddings('glove')

# 5. initialize sequence tagger
model = SequenceTagger(hidden_size=256,
                       embeddings=embeddings,
                       tag_dictionary=label_dict,
                       tag_type=label_type)

# 6. initialize trainer
trainer = ModelTrainer(model, corpus)

# 7. start training
trainer.train('resources/taggers/example-upos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)
```

This code (1) loads the English Universal Dependencies dataset as training corpus, (2) creates a label dictionary for universal part-of-speech tags from the corpus, (3) initializes embeddings, and (4) runs the trainer for 10 epochs.
+ +Running this script should produce output that looks like this during training: + +``` +2023-02-27 17:07:38,014 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,016 Model training base path: "resources/taggers/example-upos" +2023-02-27 17:07:38,017 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,020 Device: cuda:0 +2023-02-27 17:07:38,022 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,023 Embeddings storage mode: cpu +2023-02-27 17:07:38,025 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:39,128 epoch 1 - iter 4/40 - loss 3.28409882 - time (sec): 1.10 - samples/sec: 2611.84 - lr: 0.100000 +2023-02-27 17:07:39,474 epoch 1 - iter 8/40 - loss 3.13510367 - time (sec): 1.45 - samples/sec: 3143.21 - lr: 0.100000 +2023-02-27 17:07:39,910 epoch 1 - iter 12/40 - loss 3.02619775 - time (sec): 1.88 - samples/sec: 3434.39 - lr: 0.100000 +2023-02-27 17:07:40,167 epoch 1 - iter 16/40 - loss 2.95288554 - time (sec): 2.14 - samples/sec: 3783.76 - lr: 0.100000 +2023-02-27 17:07:40,504 epoch 1 - iter 20/40 - loss 2.86820018 - time (sec): 2.48 - samples/sec: 4171.22 - lr: 0.100000 +2023-02-27 17:07:40,843 epoch 1 - iter 24/40 - loss 2.80507526 - time (sec): 2.82 - samples/sec: 4557.72 - lr: 0.100000 +2023-02-27 17:07:41,118 epoch 1 - iter 28/40 - loss 2.74217397 - time (sec): 3.09 - samples/sec: 4878.00 - lr: 0.100000 +2023-02-27 17:07:41,420 epoch 1 - iter 32/40 - loss 2.69161746 - time (sec): 3.39 - samples/sec: 5072.93 - lr: 0.100000 +2023-02-27 17:07:41,705 epoch 1 - iter 36/40 - loss 2.63837577 - time (sec): 3.68 - samples/sec: 5260.02 - lr: 0.100000 +2023-02-27 17:07:41,972 epoch 1 - iter 40/40 - loss 2.58915523 - time (sec): 3.95 - samples/sec: 5394.33 - lr: 0.100000 +2023-02-27 17:07:41,975 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:41,977 EPOCH 1 done: loss 2.5892 - lr 0.100000 +2023-02-27 17:07:42,567 DEV : loss 2.009714126586914 - f1-score (micro avg) 0.41 +2023-02-27 17:07:42,579 BAD EPOCHS (no improvement): 0 +``` + +The output monitors the loss over the epochs. At the end of each epoch, the development score is computed and printed. + +And a **final evaluation report** gets printed in the end: + +``` +Results: +- F-score (micro) 0.7732 +- F-score (macro) 0.6329 +- Accuracy 0.7732 + +By class: + precision recall f1-score support + + NOUN 0.7199 0.7199 0.7199 407 + PUNCT 0.9263 0.9843 0.9544 319 + VERB 0.7521 0.6938 0.7218 258 + PRON 0.7782 0.9300 0.8474 200 + ADP 0.8559 0.9515 0.9011 206 + PROPN 0.6585 0.6398 0.6490 211 + ADJ 0.5654 0.6914 0.6221 175 + DET 0.9572 0.8995 0.9275 199 + AUX 0.8609 0.8784 0.8696 148 + ADV 0.5052 0.5000 0.5026 98 + CCONJ 0.9833 0.9077 0.9440 65 + NUM 0.5435 0.3289 0.4098 76 + PART 0.9091 0.7143 0.8000 56 + SCONJ 0.7083 0.5667 0.6296 30 + SYM 0.3333 0.2143 0.2609 14 + X 0.0000 0.0000 0.0000 15 + INTJ 0.0000 0.0000 0.0000 14 + + accuracy 0.7732 2491 + macro avg 0.6504 0.6247 0.6329 2491 +weighted avg 0.7635 0.7732 0.7655 2491 +``` + +This report gives us a breakdown of the precision, recall and F1 score of all classes, as well as overall. + +Congrats, you just trained your first model! + + +## Step-by-step walkthrough + +The above code showed you how to train a part-of-speech tagger. 
+
+Now let's individually look at each of the main steps in the above script:
+
+### Step 1: Load a Corpus
+
+The first thing you need is data to train and evaluate your model on.
+
+In Flair, training is done using the [`Corpus`](#flair.data.Corpus) object that holds three "splits": a `train`, a `dev` and a `test` split.
+
+```{note}
+
+Splitting your data into three splits is standard procedure in machine learning: the `train` split is used to train the model while the `dev` split is used for model selection and early stopping. The `test` split is used only for the final evaluation.
+```
+
+In this example, we use the [English Universal Dependencies](https://universaldependencies.org/treebanks/en_ewt/index.html) dataset to train on. It contains many sentences fully annotated with both universal and language-specific part-of-speech tags. Running these lines will load and print the corpus:
+
+```python
+from flair.datasets import UD_ENGLISH
+
+# 1. load the corpus
+corpus = UD_ENGLISH().downsample(0.1)
+print(corpus)
+```
+
+which should print:
+
+```
+Corpus: 1254 train + 200 dev + 208 test sentences
+```
+
+This shows that our downsampled training data has three splits: a training split of 1254 sentences, a dev split of 200 sentences, and a test split of 208 sentences.
+
+```{note}
+The [`Corpus`](#flair.data.Corpus) object has a number of very handy helper functions that let you manipulate the data and compute statistics. For instance, in the code above we called [`Corpus.downsample(0.1)`](#flair.data.Corpus.downsample) to downsample the corpus to 10% of its original size. To learn about more helper functions, check out the [corpus tutorial](how-to-load-prepared-dataset.md).
+```
+
+### Step 2: Choose the label type
+
+After you load the corpus, you need to decide which label type to predict.
+
+We choose the label type **'upos'**, since we want to predict universal part-of-speech tags in this example.
+
+```python
+# 2. what label do we want to predict?
+label_type = 'upos'
+```
+
+```{note}
+You might ask: why is specifying the `label_type` even necessary? Well, some corpora have more than one label type. The English UD treebank for instance has both universal PoS tags ('upos') and regular PoS tags ('pos'), plus many other layers of annotation. A tagger is normally trained to predict just one type of annotation.
+
+This means that you need to know which label types a specific corpus has labels for, and choose one of them.
+```
+
+
+### Step 3: Creating a label dictionary
+
+Our model needs to predict a set of labels. To determine the label set, run [`Corpus.make_label_dictionary()`](#flair.data.Corpus.make_label_dictionary) on the corpus and pass the label type you want to predict. In this example, we pass **'upos'** since we want to predict universal part-of-speech tags.
+
+Running these lines will compute and print the label dictionary from the corpus:
+
+```python
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type)
+print(label_dict)
+```
+
+which should print:
+
+```
+Dictionary with 18 tags: <unk>, NOUN, PUNCT, VERB, PRON, ADP, DET, AUX, ADJ, PROPN, ADV, CCONJ, PART, SCONJ, NUM, X, SYM, INTJ
+```
+
+This shows that our label dictionary has 18 PoS tags, including one generic tag (`<unk>`) for all unknown labels.
+
+### Step 4: Initialize embeddings
+
+All models in Flair require you to choose embeddings. In most cases, you'll want transformer embeddings.
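+For reference, such transformer embeddings would be initialized roughly like this (a minimal sketch; 'xlm-roberta-large' stands in for any model name from the Hugging Face hub):
+
+```python
+from flair.embeddings import TransformerWordEmbeddings
+
+# fine-tuneable transformer word embeddings (the model name is just an example)
+embeddings = TransformerWordEmbeddings('xlm-roberta-large', fine_tune=True)
+```
+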
+Choosing the right embeddings and parameters is crucial for training good models.
+
+In our example, we use simple GloVe embeddings:
+
+
+```python
+# 4. initialize embeddings
+embeddings = WordEmbeddings('glove')
+```
+
+But this is only to make the example code run fast. We generally advise using transformer-based embeddings instead.
+
+### Step 5: Initialize the Model
+
+Depending on what you want to do, you need to initialize the appropriate model type.
+
+For this example, we use the [`SequenceTagger`](#flair.models.SequenceTagger) since we do part-of-speech tagging:
+
+```python
+# 5. initialize sequence tagger
+model = SequenceTagger(hidden_size=256,
+                       embeddings=embeddings,
+                       tag_dictionary=label_dict,
+                       tag_type=label_type)
+```
+
+Printing it will give you the PyTorch model that is initialized.
+
+```{note}
+Depending on the task, you need a different model type: For sequence labeling (NER, part-of-speech tagging) you need the [`SequenceTagger`](#flair.models.SequenceTagger). For text classification you need the [`TextClassifier`](#flair.models.TextClassifier).
+
+For each model type, we are creating dedicated tutorials to better explain what they do.
+```
+
+### Step 6: Initialize the Trainer
+
+The [`ModelTrainer`](#flair.trainers.ModelTrainer) is initialized simply by passing the model and the corpus because that is all it needs.
+
+```python
+# 6. initialize trainer
+trainer = ModelTrainer(model, corpus)
+```
+
+### Step 7: Train
+
+Once the trainer is initialized, you can call [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train) to launch a standard training run.
+
+```python
+# 7. start training
+trainer.train('resources/taggers/example-upos',
+              learning_rate=0.1,
+              mini_batch_size=32,
+              max_epochs=10)
+```
+
+This will launch a "standard training run" with SGD as optimizer. By default, the learning rate is annealed against the development score: if for 3 epochs there is no improvement on the dev split, the learning rate is halved. If this happens too often, the learning rate will fall below a minimal threshold and training stops early.
+
+The `max_epochs` parameter is set to a small number in this script to make it run fast, but normally you should use a much higher value (150 or 200).
+
+```{note}
+There are two main mechanisms to train a model in Flair. (1) The "classic" workflow (SGD with annealing) is invoked as above using the [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train) method. (2) The current state-of-the-art based on fine-tuning (AdamW with a linear learning rate schedule) is invoked using the [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune) method. In most cases, you will want to use the latter.
+```
+
+### Step 8: Predict
+
+Once the model is trained you can use it to predict tags for new sentences. Just call the [`.predict()`](#flair.nn.Classifier.predict) method of the model.
+
+```python
+from flair.data import Sentence
+
+# load the model you trained
+model = SequenceTagger.load('resources/taggers/example-upos/final-model.pt')
+
+# create example sentence
+sentence = Sentence('I love Berlin')
+
+# predict tags and print
+model.predict(sentence)
+
+print(sentence.to_tagged_string())
+```
+
+If the model works well, it will correctly tag 'love' as a verb in this example.
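+
+You can also predict for many sentences at once: [`predict()`](#flair.nn.Classifier.predict) accepts a list of sentences and batches the computation internally. A short usage sketch (the example sentences are made up):
+
+```python
+# predict for multiple sentences in one call
+sentences = [Sentence('I love Berlin'), Sentence('Peter works in Hamburg')]
+model.predict(sentences, mini_batch_size=32)
+
+for sentence in sentences:
+    print(sentence.to_tagged_string())
+```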
+
+## Summary
+
+This tutorial gave you a general overview of the main steps to train a model:
+
+- load a corpus
+- choose a label type
+- create a label dictionary
+- choose embeddings
+- initialize model
+- initialize trainer
+- train
+
+
diff --git a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
new file mode 100644
index 000000000..1e7fadb0f
--- /dev/null
+++ b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
@@ -0,0 +1,161 @@
+# How to load a custom dataset
+
+This part of the tutorial shows how you can load a corpus for training a model.
+
+## Loading a ColumnCorpus
+
+In case you want to train on a sequence labeling dataset that is not among Flair's prepared datasets, you can load it with the [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus) object.
+Most sequence labeling datasets in NLP use some sort of column format in which each line is a word and each column is
+one level of linguistic annotation. See for instance this sentence:
+
+```console
+George N B-PER
+Washington N I-PER
+went V O
+to P O
+Washington N B-LOC
+
+Sam N B-PER
+Houston N I-PER
+stayed V O
+home N O
+```
+
+The first column is the word itself, the second contains coarse PoS tags, and the third BIO-annotated NER tags. An empty line separates sentences. To read such a
+dataset, define the column structure as a dictionary and instantiate a [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus).
+
+```python
+from flair.data import Corpus
+from flair.datasets import ColumnCorpus
+
+# define columns
+columns = {0: 'text', 1: 'pos', 2: 'ner'}
+
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data/folder'
+
+# init a corpus using column format, data folder and the names of the train, dev and test files
+corpus: Corpus = ColumnCorpus(data_folder, columns,
+                              train_file='train.txt',
+                              test_file='test.txt',
+                              dev_file='dev.txt')
+
+```
+
+This gives you a [`Corpus`](#flair.data.Corpus) object that contains the train, dev and test splits, each of which holds a list of [`Sentence`](#flair.data.Sentence) objects.
+So, to check how many sentences there are in the training split, do
+
+```python
+len(corpus.train)
+```
+
+You can also access a sentence and check out annotations. Let's assume that the training split is
+read from the example above, then executing these commands
+
+```python
+print(corpus.train[0].to_tagged_string('ner'))
+print(corpus.train[1].to_tagged_string('pos'))
+```
+
+will print the sentences with different layers of annotation:
+
+```console
+George <B-PER> Washington <I-PER> went to Washington <B-LOC> .
+
+Sam <N> Houston <N> stayed <V> home <N>
+```
+
+## Reading a text classification dataset
+
+If you want to use your own text classification dataset, there are currently two methods to go about this:
+load specified text and labels from a simple CSV file or format your data to the
+[FastText format](https://fasttext.cc/docs/en/supervised-tutorial.html).
+
+### Load from simple CSV file
+
+Many text classification datasets are distributed as simple CSV files in which each row corresponds to a data point and
+columns correspond to text, labels, and other metadata. You can load a CSV format classification dataset using
+[`CSVClassificationCorpus`](#flair.datasets.document_classification.CSVClassificationCorpus) by passing in a column format (like in [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus) above). This column format indicates
+which column(s) in the CSV hold the text and which hold the label(s).
+By default, Python's CSV library assumes that
+your files are in Excel CSV format, but [you can specify additional parameters](https://docs.python.org/3/library/csv.html#csv-fmt-params)
+if you use custom delimiters or quote characters.
+
+Note: You will need to save your split CSV data files in the `data_folder` path with each file titled appropriately, i.e.
+`train.csv`, `test.csv`, `dev.csv`. This is because the corpus initializers will automatically search for the train,
+dev, test splits in a folder.
+
+```python
+from flair.data import Corpus
+from flair.datasets import CSVClassificationCorpus
+
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data'
+
+# column format indicating which columns hold the text and label(s)
+column_name_map = {4: "text", 1: "label_topic", 2: "label_subtopic"}
+
+# load corpus containing training, test and dev data and if CSV has a header, you can skip it
+corpus: Corpus = CSVClassificationCorpus(data_folder,
+                                         column_name_map,
+                                         skip_header=True,
+                                         delimiter='\t',    # tab-separated files
+)
+```
+
+
+### FastText format
+If using [`CSVClassificationCorpus`](#flair.datasets.document_classification.CSVClassificationCorpus) is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. The format looks like this:
+
+```bash
+__label__<class_1> <text>
+__label__<class_1> __label__<class_2> <text>
+```
+
+As previously mentioned, to create a [`Corpus`](#flair.data.Corpus) for a text classification task, you need to have three files (train, dev, and test) in the
+above format located in one folder. This data folder structure could, for example, look like this for the IMDB task:
+```text
+/resources/tasks/imdb/train.txt
+/resources/tasks/imdb/dev.txt
+/resources/tasks/imdb/test.txt
+```
+Now create a [`ClassificationCorpus`](#flair.datasets.document_classification.ClassificationCorpus) by pointing to this folder (`/resources/tasks/imdb`).
+Each line in a file is thereby converted to a [`Sentence`](#flair.data.Sentence) object annotated with its labels.
+
+```{important}
+A text in a line can have multiple sentences. Thus, a [`Sentence`](#flair.data.Sentence) object can actually consist of multiple sentences.
+```
+
+```python
+from flair.data import Corpus
+from flair.datasets import ClassificationCorpus
+
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data/folder'
+
+# load corpus containing training, test and dev data
+corpus: Corpus = ClassificationCorpus(data_folder,
+                                      test_file='test.txt',
+                                      dev_file='dev.txt',
+                                      train_file='train.txt',
+                                      label_type='topic',
+                                      )
+```
+
+Note again that our corpus initializers have methods to automatically look for train, dev and test splits in a folder. So in
+most cases you don't need to specify the file names yourself. Often, this is enough:
+
+```python
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data/folder'
+
+# load corpus by pointing to folder. Train, dev and test gets identified automatically.
+corpus: Corpus = ClassificationCorpus(data_folder,
+                                      label_type='topic',
+                                      )
+```
+
+Since the FastText format does not have columns, you must manually define a name for the annotations. In this
+example we chose `label_type='topic'` to denote that we are loading a corpus with topic labels.
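+
+After loading, it can be useful to sanity-check that texts and labels arrived as expected. A small usage sketch (assuming the corpus object from above):
+
+```python
+# print the first training document and its topic label(s)
+print(corpus.train[0])
+print(corpus.train[0].get_labels('topic'))
+```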
+
+
diff --git a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
new file mode 100644
index 000000000..ed29bea50
--- /dev/null
+++ b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
@@ -0,0 +1,195 @@
+# How to load a prepared dataset
+
+This part of the tutorial shows how you can load a corpus for training a model.
+
+## The Corpus Object
+
+The [`Corpus`](#flair.data.Corpus) represents a dataset that you use to train a model. It consists of a list of `train` sentences,
+a list of `dev` sentences, and a list of `test` sentences, which correspond to the training, validation and testing
+split during model training.
+
+The following example snippet instantiates the Universal Dependency Treebank for English as a corpus object:
+
+```python
+from flair.datasets import UD_ENGLISH
+corpus = UD_ENGLISH()
+```
+
+The first time you call this snippet, it triggers a download of the Universal Dependency Treebank for English onto your
+hard drive. It then reads the train, test and dev splits into the [`Corpus`](#flair.data.Corpus) which it returns. Check the length of
+the three splits to see how many Sentences they contain:
+
+```python
+# print the number of Sentences in the train split
+print(len(corpus.train))
+
+# print the number of Sentences in the test split
+print(len(corpus.test))
+
+# print the number of Sentences in the dev split
+print(len(corpus.dev))
+```
+
+You can also access the [`Sentence`](#flair.data.Sentence) objects in each split directly. For instance, let us look at the first Sentence in
+the test split of the English UD:
+
+```python
+# get the first Sentence in the test split
+sentence = corpus.test[0]
+
+# print with all annotations
+print(sentence)
+
+# print only with POS annotations (better readability)
+print(sentence.to_tagged_string('pos'))
+```
+
+The sentence is fully tagged with syntactic and morphological information. With the latter line,
+you print out only the POS tags:
+
+```console
+Sentence: "What if Google Morphed Into GoogleOS ?" → ["What"/WP, "if"/IN, "Google"/NNP, "Morphed"/VBD, "Into"/IN, "GoogleOS"/NNP, "?"/.]
+```
+
+So the corpus is tagged and ready for training.
+
+### Helper functions
+
+A [`Corpus`](#flair.data.Corpus) contains a bunch of useful helper functions.
+For instance, you can downsample the data by calling [`Corpus.downsample()`](#flair.data.Corpus.downsample) and passing a ratio. So, if you normally get a
+corpus like this:
+
+```python
+from flair.datasets import UD_ENGLISH
+corpus = UD_ENGLISH()
+```
+
+then you can simply downsample the corpus like this:
+
+```python
+from flair.datasets import UD_ENGLISH
+downsampled_corpus = UD_ENGLISH().downsample(0.1)
+```
+
+If you print both corpora, you see that the second one has been downsampled to 10% of the data.
+
+```python
+print("--- 1 Original ---")
+print(corpus)
+
+print("--- 2 Downsampled ---")
+print(downsampled_corpus)
+```
+
+This should print:
+
+```console
+--- 1 Original ---
+Corpus: 12543 train + 2002 dev + 2077 test sentences
+
+--- 2 Downsampled ---
+Corpus: 1255 train + 201 dev + 208 test sentences
+```
+
+### Creating label dictionaries
+
+For many learning tasks you need to create a "dictionary" that contains all the labels you want to predict.
+You can generate this dictionary directly out of the [`Corpus`](#flair.data.Corpus) by calling the method [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary)
+and passing the desired `label_type`.
+
+For instance, the UD_ENGLISH corpus instantiated above has multiple layers of annotation like regular
+POS tags ('pos'), universal POS tags ('upos'), morphological tags ('tense', 'number', ...) and so on.
+Create label dictionaries for universal POS tags by passing `label_type='upos'` like this:
+
+```python
+# create label dictionary for a Universal Part-of-Speech tagging task
+upos_dictionary = corpus.make_label_dictionary(label_type='upos')
+
+# print dictionary
+print(upos_dictionary)
+```
+
+This will print out the created dictionary:
+
+```console
+Dictionary with 17 tags: PROPN, PUNCT, ADJ, NOUN, VERB, DET, ADP, AUX, PRON, PART, SCONJ, NUM, ADV, CCONJ, X, INTJ, SYM
+```
+
+#### Dictionaries for other label types
+
+If you don't know the label types in a corpus, just call [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary) with
+any random label name (e.g. `corpus.make_label_dictionary(label_type='abcd')`). This will print
+out statistics on all label types in the corpus:
+
+```console
+The corpus contains the following label types: 'lemma' (in 12543 sentences), 'upos' (in 12543 sentences), 'pos' (in 12543 sentences), 'dependency' (in 12543 sentences), 'number' (in 12036 sentences), 'verbform' (in 10122 sentences), 'prontype' (in 9744 sentences), 'person' (in 9381 sentences), 'mood' (in 8911 sentences), 'tense' (in 8747 sentences), 'degree' (in 7148 sentences), 'definite' (in 6851 sentences), 'case' (in 6486 sentences), 'gender' (in 2824 sentences), 'numtype' (in 2771 sentences), 'poss' (in 2516 sentences), 'voice' (in 1085 sentences), 'typo' (in 399 sentences), 'extpos' (in 185 sentences), 'abbr' (in 168 sentences), 'reflex' (in 98 sentences), 'style' (in 31 sentences), 'foreign' (in 5 sentences)
+```
+
+This means that you can create dictionaries for any of these label types for the [`UD_ENGLISH`](#flair.datasets.treebanks.UD_ENGLISH) corpus. Let's create dictionaries for regular part-of-speech tags
+and a morphological number tagging task:
+
+```python
+# create label dictionary for a regular POS tagging task
+pos_dictionary = corpus.make_label_dictionary(label_type='pos')
+
+# create label dictionary for a morphological number tagging task
+number_dictionary = corpus.make_label_dictionary(label_type='number')
+```
+
+If you print these dictionaries, you will find that the POS dictionary contains 50 tags and the number dictionary only 2 for this corpus (singular and plural).
+
+
+#### Dictionaries for other corpus types
+
+The method [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary) can be used for any corpus, including text classification corpora:
+
+```python
+# create label dictionary for a text classification task
+from flair.datasets import TREC_6
+corpus = TREC_6()
+corpus.make_label_dictionary('question_class')
+```
+
+### The MultiCorpus Object
+
+If you want to train multiple tasks at once, you can use the [`MultiCorpus`](#flair.data.MultiCorpus) object.
+To initialize the [`MultiCorpus`](#flair.data.MultiCorpus) you first need to create any number of [`Corpus`](#flair.data.Corpus) objects. Afterwards, you can pass
+a list of [`Corpus`](#flair.data.Corpus) to the [`MultiCorpus`](#flair.data.MultiCorpus) object. For instance, the following snippet loads a combination corpus
+consisting of the English, German and Dutch Universal Dependency Treebanks.
+
+```python
+from flair.datasets import UD_ENGLISH, UD_GERMAN, UD_DUTCH
+english_corpus = UD_ENGLISH()
+german_corpus = UD_GERMAN()
+dutch_corpus = UD_DUTCH()
+
+# make a multi corpus consisting of three UDs
+from flair.data import MultiCorpus
+multi_corpus = MultiCorpus([english_corpus, german_corpus, dutch_corpus])
+```
+
+The [`MultiCorpus`](#flair.data.MultiCorpus) inherits from [`Corpus`](#flair.data.Corpus), so you can use it like any other corpus to train your models.
+
+## Datasets included in Flair
+
+Flair supports many datasets out of the box. It usually automatically downloads and sets up the data the first time you
+call the corresponding constructor.
+The datasets are split into multiple modules, but they can all be imported from `flair.datasets` as well.
+You can look up the respective modules to find the possible datasets.
+
+The following datasets are supported:
+
+| Task                                | Module                                                                                                                                        |
+|-------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|
+| Named Entity Recognition            | [flair.datasets.sequence_labeling](#flair.datasets.sequence_labeling)                                                                         |
+| Text Classification                 | [flair.datasets.document_classification](#flair.datasets.document_classification)                                                            |
+| Text Regression                     | [flair.datasets.document_classification](#flair.datasets.document_classification)                                                            |
+| Biomedical Named Entity Recognition | [flair.datasets.biomedical](#flair.datasets.biomedical)                                                                                       |
+| Entity Linking                      | [flair.datasets.entity_linking](#flair.datasets.entity_linking)                                                                               |
+| Relation Extraction                 | [flair.datasets.relation_extraction](#flair.datasets.relation_extraction)                                                                     |
+| Sequence Labeling                   | [flair.datasets.sequence_labeling](#flair.datasets.sequence_labeling)                                                                         |
+| Glue Benchmark                      | [flair.datasets.text_text](#flair.datasets.text_text) and [flair.datasets.document_classification](#flair.datasets.document_classification)  |
+| Universal Proposition Banks         | [flair.datasets.treebanks](#flair.datasets.treebanks)                                                                                         |
+| Universal Dependency Treebanks      | [flair.datasets.treebanks](#flair.datasets.treebanks)                                                                                         |
+| OCR-Layout-NER                      | [flair.datasets.ocr](#flair.datasets.ocr)                                                                                                     |
+
diff --git a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md
new file mode 100644
index 000000000..247b3daa1
--- /dev/null
+++ b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md
@@ -0,0 +1,225 @@
+# Train a sequence tagger
+
+Sequence labeling models are used to model problems such as named entity recognition (NER) and
+part-of-speech (PoS) tagging.
+
+This tutorial section shows you how to train state-of-the-art NER models and other taggers in Flair.
+
+## Training a named entity recognition (NER) model with transformers
+
+For a state-of-the-art NER system you should fine-tune transformer embeddings and use full document context
+(see our [FLERT](https://arxiv.org/abs/2011.06993) paper for details).
+
+Use the following script:
+
+```python
+from flair.datasets import CONLL_03
+from flair.embeddings import TransformerWordEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+
+# 1. get the corpus
+corpus = CONLL_03()
+print(corpus)
+
+# 2. what label do we want to predict?
+label_type = 'ner'
+
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
+print(label_dict)
+
+# 4. initialize fine-tuneable transformer embeddings WITH document context
+embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
+                                       layers="-1",
+                                       subtoken_pooling="first",
+                                       fine_tune=True,
+                                       use_context=True,
+                                       )
+
+# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection)
+tagger = SequenceTagger(hidden_size=256,
+                        embeddings=embeddings,
+                        tag_dictionary=label_dict,
+                        tag_type='ner',
+                        use_crf=False,
+                        use_rnn=False,
+                        reproject_embeddings=False,
+                        )
+
+# 6. initialize trainer
+trainer = ModelTrainer(tagger, corpus)
+
+# 7. run fine-tuning
+trainer.fine_tune('resources/taggers/sota-ner-flert',
+                  learning_rate=5.0e-6,
+                  mini_batch_size=4,
+                  mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
+                  )
+```
+
+As you can see, we use [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) based on 'xlm-roberta-large' embeddings. We enable fine-tuning and set `use_context` to True.
+We also deactivate the RNN, CRF and reprojection in the [`SequenceTagger`](#flair.models.SequenceTagger). This is because the
+transformer is so powerful that it does not need these components. We then fine-tune the model with a very small
+learning rate on the corpus.
+
+This will give you state-of-the-art numbers similar to the ones reported
+in [Schweter and Akbik (2021)](https://arxiv.org/abs/2011.06993).
+
+
+## Training a named entity recognition (NER) model with Flair embeddings
+
+As an alternative to fine-tuning a very large transformer, you can use a classic training setup without fine-tuning.
+In the classic setup, you train an LSTM-CRF on top of frozen embeddings. We typically use a 'stack' that combines
+Flair and GloVe embeddings:
+
+```python
+from flair.datasets import CONLL_03
+from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+
+# 1. get the corpus
+corpus = CONLL_03()
+print(corpus)
+
+# 2. what label do we want to predict?
+label_type = 'ner'
+
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
+print(label_dict)
+
+# 4. initialize embedding stack with Flair and GloVe
+embedding_types = [
+    WordEmbeddings('glove'),
+    FlairEmbeddings('news-forward'),
+    FlairEmbeddings('news-backward'),
+]
+
+embeddings = StackedEmbeddings(embeddings=embedding_types)
+
+# 5. initialize sequence tagger
+tagger = SequenceTagger(hidden_size=256,
+                        embeddings=embeddings,
+                        tag_dictionary=label_dict,
+                        tag_type=label_type)
+
+# 6. initialize trainer
+trainer = ModelTrainer(tagger, corpus)
+
+# 7. start training
+trainer.train('resources/taggers/sota-ner-flair',
+              learning_rate=0.1,
+              mini_batch_size=32,
+              max_epochs=150)
+```
+
+This will give you state-of-the-art numbers similar to the ones reported in [Akbik et al. (2018)](https://aclanthology.org/C18-1139.pdf).
+The numbers are not quite as high as fine-tuning transformers, but this setup requires less GPU memory and, depending on your
+setup, may run faster in the end.
+
+
+## Training a part-of-speech tagger
+
+If you want to train a part-of-speech model instead of NER, simply exchange the corpus and the label type:
+
+```python
+from flair.datasets import UD_ENGLISH
+from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+
+# 1. get the corpus
+corpus = UD_ENGLISH()
+print(corpus)
+
+# 2. what label do we want to predict?
+label_type = 'upos'
+
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type)
+print(label_dict)
+
+# 4. initialize embeddings
+embedding_types = [
+    WordEmbeddings('glove'),
+    FlairEmbeddings('news-forward'),
+    FlairEmbeddings('news-backward'),
+]
+
+embeddings = StackedEmbeddings(embeddings=embedding_types)
+
+# 5. initialize sequence tagger
+tagger = SequenceTagger(hidden_size=256,
+                        embeddings=embeddings,
+                        tag_dictionary=label_dict,
+                        tag_type=label_type,
+                        use_crf=True)
+
+# 6. initialize trainer
+trainer = ModelTrainer(tagger, corpus)
+
+# 7. start training
+trainer.train('resources/taggers/example-upos',
+              learning_rate=0.1,
+              mini_batch_size=32)
+```
+
+This script will give you the state-of-the-art accuracy reported in [Akbik et al. (2018)](https://aclanthology.org/C18-1139.pdf).
+
+## Multi-dataset training
+
+Now, let us train a single model that can PoS tag text in both English and German. To do this, we load both the English
+and German UD corpora and create a [`MultiCorpus`](#flair.data.MultiCorpus) object. We also use the new multilingual Flair embeddings for this task.
+
+Everything else is the same as before, e.g.:
+
+```python
+from flair.data import MultiCorpus
+from flair.datasets import UD_ENGLISH, UD_GERMAN
+from flair.embeddings import FlairEmbeddings, StackedEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+
+# 1. get the corpora - English and German UD
+corpus = MultiCorpus([UD_ENGLISH(), UD_GERMAN()]).downsample(0.1)
+
+# 2. what label do we want to predict?
+label_type = 'upos'
+
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type)
+print(label_dict)
+
+# 4. initialize embeddings
+embedding_types = [
+
+    # we use multilingual Flair embeddings in this task
+    FlairEmbeddings('multi-forward'),
+    FlairEmbeddings('multi-backward'),
+]
+
+embeddings = StackedEmbeddings(embeddings=embedding_types)
+
+# 5. initialize sequence tagger
+tagger = SequenceTagger(hidden_size=256,
+                        embeddings=embeddings,
+                        tag_dictionary=label_dict,
+                        tag_type=label_type,
+                        use_crf=True)
+
+# 6. initialize trainer
+trainer = ModelTrainer(tagger, corpus)
+
+# 7. start training
+trainer.train('resources/taggers/example-universal-pos',
+              learning_rate=0.1,
+              mini_batch_size=32,
+              max_epochs=150,
+              )
+```
+
+This gives you a multilingual model. Try experimenting with more languages!
+
+
diff --git a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md
new file mode 100644
index 000000000..265689c21
--- /dev/null
+++ b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md
@@ -0,0 +1,60 @@
+# Train a text classifier
+
+This tutorial shows you how to train your own text classifier models with Flair. For instance, you
+could train your own sentiment analysis model or an offensive language detection model.
+
+
+## Training a text classification model with transformers
+
+For text classification, you reach state-of-the-art scores by fine-tuning a transformer.
+
+Training a model is easy: load the appropriate corpus, make a label dictionary, then fine-tune a [`TextClassifier`](#flair.models.TextClassifier)
+model using the [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune) method. See the example script below:
+
+```python
+from flair.data import Corpus
+from flair.datasets import TREC_6
+from flair.embeddings import TransformerDocumentEmbeddings
+from flair.models import TextClassifier
+from flair.trainers import ModelTrainer
+
+# 1. get the corpus
+corpus: Corpus = TREC_6()
+
+# 2. what label do we want to predict?
+label_type = 'question_class'
+
+# 3. create the label dictionary
+label_dict = corpus.make_label_dictionary(label_type=label_type)
+
+# 4. initialize transformer document embeddings (many models are available)
+document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)
+
+# 5. create the text classifier
+classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type)
+
+# 6. initialize trainer
+trainer = ModelTrainer(classifier, corpus)
+
+# 7. run training with fine-tuning
+trainer.fine_tune('resources/taggers/question-classification-with-transformer',
+                  learning_rate=5.0e-5,
+                  mini_batch_size=4,
+                  max_epochs=10,
+                  )
+```
+
+Once the model is trained you can load it to predict the class of new sentences. Just call the [`predict`](#flair.nn.DefaultClassifier.predict) method of the model.
+
+```python
+from flair.data import Sentence
+
+# load the trained model
+classifier = TextClassifier.load('resources/taggers/question-classification-with-transformer/final-model.pt')
+
+# create example sentence
+sentence = Sentence('Who built the Eiffel Tower ?')
+
+# predict class and print
+classifier.predict(sentence)
+
+print(sentence.labels)
+```
+
diff --git a/docs/tutorial/tutorial-training/index.rst b/docs/tutorial/tutorial-training/index.rst
new file mode 100644
index 000000000..70209a3f7
--- /dev/null
+++ b/docs/tutorial/tutorial-training/index.rst
@@ -0,0 +1,15 @@
+Tutorial 2: Training models
+===========================
+
+This tutorial illustrates how you can train your own state-of-the-art NLP models with Flair.
+
+.. toctree::
+   :glob:
+   :maxdepth: 1
+
+   how-model-training-works
+   train-vs-fine-tune
+   how-to-load-prepared-dataset
+   how-to-load-custom-dataset
+   how-to-train-sequence-tagger
+   how-to-train-text-classifier
diff --git a/docs/tutorial/tutorial-training/train-vs-fine-tune.md b/docs/tutorial/tutorial-training/train-vs-fine-tune.md
new file mode 100644
index 000000000..fd45e90ea
--- /dev/null
+++ b/docs/tutorial/tutorial-training/train-vs-fine-tune.md
@@ -0,0 +1,11 @@
+# Training vs fine-tuning
+
+There are two broad ways to train a model: the "classic" approach and the fine-tuning approach. This section
+explains the differences and what you need to do.
+
+
+## Fine-Tuning
+
+
+## Training
+
diff --git a/flair/__init__.py b/flair/__init__.py
index 2eb42075f..46550af6b 100644
--- a/flair/__init__.py
+++ b/flair/__init__.py
@@ -9,9 +9,18 @@ from .file_utils import set_proxies

 cache_root = Path(os.getenv("FLAIR_CACHE_ROOT", Path(Path.home(), ".flair")))
+"""The path to the cache folder Flair is using.
+
+This value defaults to `~/.flair`.
+You can choose the path by setting the `FLAIR_CACHE_ROOT` environment variable.
+"""

 device: torch.device
-"""Flair is using a single device for everything. You can set this device by overwriting this variable."""
+"""Flair is using a single device for everything. You can set this device by overwriting this variable.
+
+This value will be automatically set to the first found GPU if available and to CPU otherwise.
+You can choose a specific GPU by setting the `FLAIR_DEVICE` environment variable to its index.
+""" # global variable: device @@ -25,6 +34,7 @@ # global variable: version __version__ = "0.12.2" +"""The current version of the flair library installed.""" # global variable: arrow symbol _arrow = " → " @@ -55,9 +65,21 @@ ) logger = logging.getLogger("flair") +"""The logger used by Flair. + +You can reconfigure it to change the log output to your likings. +""" def set_seed(seed: int): + """Set the seed for all random generators used in training. + + Use this method to guarantee reproducibility of experiments. + + Args: + seed: any value you want + + """ hf_set_seed(seed) diff --git a/flair/data.py b/flair/data.py index 9ca6da2cc..24d8cef05 100644 --- a/flair/data.py +++ b/flair/data.py @@ -9,7 +9,7 @@ from typing import Dict, Iterable, List, NamedTuple, Optional, Union, cast import torch -from deprecated import deprecated +from deprecated.sphinx import deprecated from torch.utils.data import Dataset, IterableDataset from torch.utils.data.dataset import ConcatDataset, Subset @@ -69,8 +69,10 @@ def remove_item(self, item: str): def add_item(self, item: str) -> int: """Add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID. - :param item: a string for which to assign an id. - :return: ID of string + Args: + item: a string for which to assign an id. + + Returns: ID of string """ bytes_item = item.encode("utf-8") if bytes_item not in self.item2idx: @@ -81,8 +83,10 @@ def add_item(self, item: str) -> int: def get_idx_for_item(self, item: str) -> int: """Returns the ID of the string, otherwise 0. - :param item: string for which ID is requested - :return: ID of string, otherwise 0 + Args: + item: string for which ID is requested + + Returns: ID of string, otherwise 0 """ item_encoded = item.encode("utf-8") if item_encoded in self.item2idx: @@ -99,8 +103,10 @@ def get_idx_for_item(self, item: str) -> int: def get_idx_for_items(self, items: List[str]) -> List[int]: """Returns the IDs for each item of the list of string, otherwise 0 if not found. - :param items: List of string for which IDs are requested - :return: List of ID of strings + Args: + items: List of string for which IDs are requested + + Returns: List of ID of strings """ if not hasattr(self, "item2idx_not_encoded"): d = {key.decode("UTF-8"): value for key, value in self.item2idx.items()} @@ -706,15 +712,17 @@ def __init__( ) -> None: """Class to hold all metadata related to a text. - Metadata can be tokens, predictions, language code, ... - :param text: original string (sentence), or a list of string tokens (words) - :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`) - more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer` - to use Spacy library if available). Check the implementations of abstract class Tokenizer or - implement your own subclass (if you need it). If instead of providing a Tokenizer, this parameter - is just set to True (deprecated), :class:`SegtokTokenizer` will be used. - :param language_code: Language of the sentence - :param start_position: Start char offset of the sentence in the superordinate document + Metadata can be tokens, labels, predictions, language code, etc. + + Args: + text: original string (sentence), or a pre tokenized list of tokens. + use_tokenizer: Specify a custom tokenizer to split the text into tokens. The Default is + :class:`flair.tokenization.SegTokTokenizer`. If `use_tokenizer` is set to False, + :class:`flair.tokenization.SpaceTokenizer` will be used instead. 
+                if `text` refers to pre-tokenized tokens.
+            language_code: Language of the sentence. If not provided, [langdetect](https://pypi.org/project/langdetect/)
+                will be called when the language_code is accessed for the first time.
+            start_position: Start char offset of the sentence in the superordinate document.
         """
         super().__init__()

@@ -1354,11 +1362,14 @@ def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary:

         By defining `max_tokens` you can set the maximum number of tokens that should be contained in the dictionary.
         If there are more than `max_tokens` tokens in the corpus, the most frequent tokens are added first.
-        If `min_freq` is set the a value greater than 1 only tokens occurring more than `min_freq` times are considered
+        If `min_freq` is set to a value greater than 1 only tokens occurring more than `min_freq` times are considered
         to be added to the dictionary.
-        :param max_tokens: the maximum number of tokens that should be added to the dictionary (-1 = take all tokens)
-        :param min_freq: a token needs to occur at least `min_freq` times to be added to the dictionary (-1 = there is no limitation)
-        :return: dictionary of tokens
+
+        Args:
+            max_tokens: the maximum number of tokens that should be added to the dictionary (-1 = take all tokens)
+            min_freq: a token needs to occur at least `min_freq` times to be added to the dictionary (-1 = there is no limitation)
+
+        Returns: dictionary of tokens
         """
         tokens = self._get_most_common_tokens(max_tokens, min_freq)

@@ -1563,12 +1574,13 @@ def add_label_noise(
     ):
         """Generates uniform label noise distribution in the chosen dataset split.

-        :label_type: the type of labels for which the noise should be simulated.
-        :labels: an array with unique labels of said type (retrievable from label dictionary).
-        :noise_share: the desired share of noise in the train split.
-        :split: in which dataset split the noise is to be simulated.
-        :noise_transition_matrix: provides pre-defined probabilities for label flipping based on the
-            initial label value (relevant for class-dependent label noise simulation).
+        Args:
+            label_type: the type of labels for which the noise should be simulated.
+            labels: an array with unique labels of said type (retrievable from label dictionary).
+            noise_share: the desired share of noise in the train split.
+            split: in which dataset split the noise is to be simulated.
+            noise_transition_matrix: provides pre-defined probabilities for label flipping based on the initial
+                label value (relevant for class-dependent label noise simulation).
         """
         import numpy as np

@@ -1664,7 +1676,14 @@ def get_all_sentences(self) -> ConcatDataset:

     @deprecated(version="0.8", reason="Use 'make_label_dictionary' instead.")
     def make_tag_dictionary(self, tag_type: str) -> Dictionary:
-        # Make the tag dictionary
+        """Create a tag dictionary of a given label type.
+
+        Args:
+            tag_type: the label type to gather the tag labels
+
+        Returns: A Dictionary containing the labeled tags, including "O" and "<START>" and "<STOP>"
+
+        """
         tag_dictionary: Dictionary = Dictionary(add_unk=False)
         tag_dictionary.add_item("O")
         for sentence in _iter_dataset(self.get_all_sentences()):
@@ -1729,7 +1748,6 @@ class ConcatFlairDataset(Dataset):

     This class is useful to assemble different existing datasets.
     Args:
-    ----
         datasets (sequence): List of datasets to be concatenated
     """
diff --git a/flair/datasets/base.py b/flair/datasets/base.py
index f5550b5bc..ca6ead406 100644
--- a/flair/datasets/base.py
+++ b/flair/datasets/base.py
@@ -4,7 +4,7 @@
 from typing import Generic, List, Optional, Union

 import torch.utils.data.dataloader
-from deprecated import deprecated
+from deprecated.sphinx import deprecated

 from flair.data import DT, FlairDataset, Sentence, Tokenizer
 from flair.tokenization import SegtokTokenizer, SpaceTokenizer
@@ -44,7 +44,8 @@ class FlairDatapointDataset(FlairDataset, Generic[DT]):
     def __init__(self, datapoints: Union[DT, List[DT]]) -> None:
         """Instantiate FlairDatapointDataset.

-        :param sentences: DT or List of DT that make up FlairDatapointDataset
+        Args:
+            datapoints: DT or List of DT that make up FlairDatapointDataset
         """
         # cast to list if necessary
         if not isinstance(datapoints, list):
@@ -77,11 +78,11 @@ def __init__(
     ) -> None:
         """Instantiate StringDataset.

-        :param texts: a string or List of string that make up StringDataset
-        :param use_tokenizer: Custom tokenizer to use (default is SpaceTokenizer,
-            more advanced options are SegTokTokenizer to use segtok or SpacyTokenizer to use Spacy library models
-            if available). Check the code of subclasses of Tokenizer to implement your own (if you need it).
-            If instead of providing a function, this parameter is just set to True, SegTokTokenizer will be used.
+        Args:
+            texts: a string or List of string that make up StringDataset
+            use_tokenizer:
+                Custom tokenizer to use. If instead of providing a function, this parameter is just set to True,
+                :class:`flair.tokenization.SegtokTokenizer` will be used.
         """
         # cast to list if necessary
         if isinstance(texts, str):
@@ -130,19 +131,22 @@ def __init__(
             'Plats': 'Abrahamsby'
         }

-        :param query: Query, e.g. {'Län': 'Stockholms län'}
-        :param host: Host, e.g. 'localhost',
-        :param port: Port, e.g. 27017
-        :param database: Database, e.g. 'rosenberg',
-        :param collection: Collection, e.g. 'book',
-        :param text_field: Text field, e.g. 'Beskrivning',
-        :param categories_field: List of category fields, e.g ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats'],
-        :param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
-        :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
-        :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
-        :param tokenizer: Custom tokenizer to use (default SegtokTokenizer)
-        :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
-        :return: list of sentences
+        Args:
+            query: Query, e.g. {'Län': 'Stockholms län'}
+            host: Host, e.g. 'localhost',
+            port: Port, e.g. 27017
+            database: Database, e.g. 'rosenberg',
+            collection: Collection, e.g. 'book',
+            text_field: Text field, e.g. 'Beskrivning',
+            categories_field: List of category fields, e.g. ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats'],
+            max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
+            max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
+            max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
+            tokenizer: Custom tokenizer to use (default SegtokTokenizer)
+            in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
+            tag_type: The tag type to assign labels to.
+ + Returns: list of sentences """ # first, check if pymongo is installed try: diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index 0a5141942..2ce701d51 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -22,7 +22,7 @@ from zipfile import BadZipFile, LargeZipFile import ftfy -from deprecated import deprecated +from deprecated.sphinx import deprecated from lxml import etree from lxml.etree import XMLSyntaxError @@ -331,8 +331,8 @@ def __init__( ) -> None: """Initialize CoNLLWriter. - :param sentence_splitter: Implementation of :class:`SentenceSplitter` which - segments the text into sentences and tokens + Args: + sentence_splitter: Sentence splitter which segments the text into sentences and tokens. """ self.sentence_splitter = sentence_splitter @@ -408,15 +408,12 @@ class HunerDataset(ColumnCorpus, ABC): """Base class for HUNER datasets. Every subclass has to implement the following methods: - - `to_internal', which reads the complete data set (incl. train, dev, test) and returns the corpus - as InternalBioNerDataset - - `split_url', which returns the base url (i.e. without '.train', '.dev', '.test') to the HUNER split files + - "to_internal", which reads the complete data set (incl. train, dev, test) and returns the corpus as InternalBioNerDataset + - "split_url", which returns the base url (i.e. without '.train', '.dev', '.test') to the HUNER split files For further information see: - - Weber et al.: 'HUNER: improving biomedical NER with pretraining' - https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext - - HUNER github repository: - https://github.com/hu-ner/huner + - Weber et al.: 'HUNER: improving biomedical NER with pretraining' https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext + - HUNER github repository: https://github.com/hu-ner/huner """ @abstractmethod diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 32fb5e64c..2c0d6b341 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -44,22 +44,20 @@ def __init__( ) -> None: """Instantiates a Corpus from text classification-formatted task data. - :param data_folder: base folder with the task data - :param label_type: name of the label - :param train_file: the name of the train file - :param test_file: the name of the test file - :param dev_file: the name of the dev file, if None, dev data is sampled from train - :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens - :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars - :param filter_if_longer_than: If set, filters documents that are longer that the specified number of tokens. - :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer - :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full' - if full corpus and all embeddings fits into memory for speedups during training. Otherwise use 'partial' and if - even this is too much for your memory, use 'disk'. - :param label_name_map: Optionally map label names to different schema. - :param allow_examples_without_labels: set to True to allow Sentences without label in the corpus. 
-        :param encoding: Default is 'utf-8' but some datasets are in 'latin-1
-        :return: a Corpus with annotated train, dev and test data
+        Args:
+            data_folder: base folder with the task data
+            label_type: name of the label
+            train_file: the name of the train file
+            test_file: the name of the test file
+            dev_file: the name of the dev file, if None, dev data is sampled from train
+            truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens
+            truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
+            filter_if_longer_than: If set, filters documents that are longer than the specified number of tokens.
+            tokenizer: Tokenizer for dataset, default is SegtokTokenizer
+            memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full' if full corpus and all embeddings fit into memory for speedups during training. Otherwise use 'partial' and if even this is too much for your memory, use 'disk'.
+            label_name_map: Optionally map label names to different schema.
+            allow_examples_without_labels: set to True to allow Sentences without label in the corpus.
+            encoding: Default is 'utf-8' but some datasets are in 'latin-1'
         """
         # find train, dev and test files if not specified
         dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file)
@@ -759,14 +757,13 @@ def __init__(
     ) -> None:
         """Initialize the IMDB move review sentiment corpus.

-        :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
-        :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
-        :param rebalance_corpus: Default splits for this corpus have a strange 50/50 train/test split that are impractical.
-            With rebalance_corpus=True (default setting), corpus is rebalanced to a 80/10/10 train/dev/test split. If you
-            want to use original splits, set this to False.
-        :param memory_mode: Set to 'partial' because this is a huge corpus, but you can also set to 'full' for faster
+        Args:
+            base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
+            tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
+            rebalance_corpus: Whether to use an 80/10/10 data split instead of the original 50/0/50 split.
+            memory_mode: Set to 'partial' because this is a huge corpus, but you can also set to 'full' for faster
             processing or 'none' for less memory.
-        :param corpusargs: Other args for ClassificationCorpus.
+            corpusargs: Other args for ClassificationCorpus.
         """
         base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
diff --git a/flair/datasets/entity_linking.py b/flair/datasets/entity_linking.py
index bac2beb13..a515e0f3a 100644
--- a/flair/datasets/entity_linking.py
+++ b/flair/datasets/entity_linking.py
@@ -797,10 +797,11 @@ def __init__(
         see https://arxiv.org/abs/2101.01228v2

         The first time you call this constructor it will automatically download the dataset.
-        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
-        to point to a different folder but typically this should not be necessary.
-        :param in_memory: If True, keeps dataset in memory giving speedups in training.
-        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
+
+        Args:
+            base_path: Default is None, meaning that corpus gets auto-downloaded and loaded.
You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -940,12 +941,12 @@ def __init__( def _text_to_cols(self, sentence: Sentence, links: list, outfile): """Convert a tokenized sentence into column format. - :param sentence: Flair Sentence object containing a tokenized post title or comment thread - :param links: array containing information about the starting and ending position of an entity mention, as well - as its corresponding wiki tag - :param outfile: file, to which the output is written + Args: + sentence: Flair Sentence object containing a tokenized post title or comment thread + links: array containing information about the starting and ending position of an entity mention, as well as its corresponding wiki tag + outfile: file, to which the output is written """ - for i in range(0, len(sentence)): + for i in range(len(sentence)): # If there are annotated entity mentions for given post title or a comment thread if links: # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence @@ -1002,10 +1003,10 @@ def _text_to_cols(self, sentence: Sentence, links: list, outfile): def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: """Fills the array containing information about the entity mention annotations. - :param annot_array: array to be filled - :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation - :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) - or comment threads (=False) + Args: + annot_array: array to be filled + key: reddit id, on which the post title/comment thread is matched with its corresponding annotation + post_flag: flag indicating whether the annotations are collected for the post titles or comment threads """ while True: # Check if further annotations belong to the current post title or comment thread as well @@ -1024,8 +1025,8 @@ def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> lis def _fill_curr_comment(self, fix_flag: bool): """Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the comments are parsed. - :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) - or regular rows (=False) + Args: + fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed or regular rows """ next_row = None while True: @@ -1151,19 +1152,18 @@ def split_span(word_fields: List[str], datasetname: str): txt_out.write("\n") -def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = True): +def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = True) -> str: """Checks if the converted .tsv file already exists and if not, creates it. - Returns name of the file. - ---------- - string : str - String that contains the name of the file. - data_folder : str - String that contains the name of the folder in which the CoNLL file should reside. - cut_multisense : bool, optional - Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. 
- If True only the first listed sense will be used. Otherwise the whole list of senses will be detected - as one new sense. The default is True. + Args: + filename: The name of the file. + data_folder: The name of the folder in which the CoNLL file should reside. + cut_multisense: Determines whether the wn30_key tag should be cut if it contains multiple possible senses. + If True only the first listed sense will be used. Otherwise, the whole list of senses will be detected + as one new sense. The default is True. + + Returns: + the name of the file. """ if cut_multisense is True and filename not in [ "semeval2007task17", @@ -1211,27 +1211,18 @@ def __init__( If the constructor is called for the first time the data is automatically downloaded and transformed from xml to a tab separated column format. Since only the WordNet 3.0 version for senses is consistently available for all provided datasets we will only consider this version. Also we ignore the id annotation used in datasets that were originally created for evaluation tasks - :param filenames: Here you can pass a single datasetname or a list of ddatasetnames. The available names are: - 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', - 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', - 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'. - So you can pass for example filenames = ['masc', 'omsti', 'wngt']. Default two mid-sized datasets 'masc' and 'semcor' are loaded. - :param base_path: You can override this to point to a specific folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains - multiple possible senses. If True only the first listed sense will be used and the - suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of - senses will be detected as one new sense. The default is True. - :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "sense"} - if you want to use additional pos and/or lemma for the words. - :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if - sample_missing_splits_in_each_corpus is True) - :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. - :param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al. "Word Sense Disambiguation: A unified evaluation framework and empirical compariso") - will be used as test data. Note that the sample_missing_splits parameters are set to 'only_dev' in this case if set to True. - :param name: Name of your (costum) corpus + + Args: + filenames: Here you can pass a single datasetname or a list of datasetnames. 
The available names are: 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'. + base_path: You can override this to point to a specific folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + cut_multisense: Boolean that determines whether the wn30_key tag should be cut if it contains multiple possible senses. If True only the first listed sense will be used and the suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of senses will be detected as one new sense. The default is True. + columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "sense"} if you want to use additional pos and/or lemma for the words. + banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true + sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if sample_missing_splits_in_each_corpus is True) + sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. + use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al. "Word Sense Disambiguation: A unified evaluation framework and empirical comparison") will be used as test data. Note that the sample_missing_splits parameters are set to 'only_dev' in this case if set to True. + name: Name of your corpus """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/datasets/relation_extraction.py b/flair/datasets/relation_extraction.py index 0421802de..30709a14c 100644 --- a/flair/datasets/relation_extraction.py +++ b/flair/datasets/relation_extraction.py @@ -44,9 +44,6 @@ def __init__( """SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of Nominals. see https://aclanthology.org/S10-1006.pdf - :param base_path: - :param in_memory: - :param augment_train: """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -229,8 +226,6 @@ def __init__(self, base_path: Optional[Union[str, Path]] = None, in_memory: bool with 41 relations from https://nlp.stanford.edu/projects/tacred/. Manual download is required for this dataset. - :param base_path: - :param in_memory: """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0a5bf1b58..b3e40342b 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -281,20 +281,19 @@ def __init__( ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
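As context for the constructor parameters documented in the hunk that follows, a typical ColumnCorpus call looks like this minimal sketch (the data folder, file names, and column map are assumed placeholders):

```python
from flair.datasets import ColumnCorpus

# assumed layout: column 0 holds the text, column 1 the NER tag
columns = {0: "text", 1: "ner"}

corpus = ColumnCorpus(
    "resources/tasks/my_ner",  # hypothetical data folder
    columns,
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)
print(corpus)
```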
- :param data_folder: base folder with the task data - :param column_format: a map specifying the column format - :param train_files: the name of the train files - :param test_files: the name of the test files - :param dev_files: the name of the dev files, if empty, dev data is sampled from train - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param label_name_map: Optionally map tag names to different schema. - :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :return: a Corpus with annotated train, dev and test data + Args: + data_folder: base folder with the task data + column_format: a map specifying the column format + train_files: the name of the train files + test_files: the name of the test files + dev_files: the name of the dev files, if empty, dev data is sampled from train + column_delimiter: default is to split on any separator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true """ # get train data train: Optional[Dataset] = ( @@ -386,20 +385,19 @@ def __init__( ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. - :param data_folder: base folder with the task data - :param column_format: a map specifying the column format - :param train_file: the name of the train file - :param test_file: the name of the test file - :param dev_file: the name of the dev file, if None, dev data is sampled from train - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param label_name_map: Optionally map tag names to different schema. - :param banned_sentences: Optionally remove sentences from the corpus.
Works only if `in_memory` is true - :return: a Corpus with annotated train, dev and test data + Args: + data_folder: base folder with the task data + column_format: a map specifying the column format + train_file: the name of the train file + test_file: the name of the test file + dev_file: the name of the dev file, if None, dev data is sampled from train + column_delimiter: default is to split on any separator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true """ # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files( @@ -440,17 +438,16 @@ def __init__( ) -> None: r"""Instantiates a column dataset. - :param path_to_column_file: path to the file with the column-formatted data - :param column_name_map: a map specifying the column format - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param label_name_map: Optionally map tag names to different schema. - :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :return: a dataset with annotated data + Args: + path_to_column_file: path to the file with the column-formatted data + column_name_map: a map specifying the column format + column_delimiter: default is to split on any separator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true """ path_to_column_file = Path(path_to_column_file) assert path_to_column_file.exists() @@ -969,19 +966,11 @@ def _process_coref_span_annotations_for_word( This method mutates the clusters and coref_stacks dictionaries. - # Parameters - label : `str` - The coref label for this word. - word_index : `int` - The word index into the sentence. - clusters : `DefaultDict[int, List[Tuple[int, int]]]` - A dictionary mapping cluster ids to lists of inclusive spans into the - sentence. - coref_stacks : `DefaultDict[int, List[int]]` - Stacks for each cluster id to hold the start indices of active spans (spans - which we are inside of when processing a given word).
Spans with the same id - can be nested, which is why we collect these opening spans on a stack, e.g: - [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1 + Args: + label: The coref label for this word. + word_index: The word index into the sentence. + clusters: A dictionary mapping cluster ids to lists of inclusive spans into the sentence. + coref_stacks: Stacks for each cluster id to hold the start indices of open spans. Spans with the same id can be nested, which is why we collect these opening spans on a stack, e.g.: [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1 """ if label != "-": for segment in label.split("|"): @@ -1013,17 +1002,6 @@ def _process_span_annotations_for_word( span_labels: List[List[str]], current_span_labels: List[Optional[str]], ) -> None: - """Given a sequence of different label types for a single word and the current span label we are inside, compute the BIO tag for each label and append to a list. - - # Parameters - annotations : `List[str]` - A list of labels to compute BIO tags for. - span_labels : `List[List[str]]` - A list of lists, one for each annotation, to incrementally collect - the BIO tags for a sequence. - current_span_labels : `List[Optional[str]]` - The currently open span per annotation type, or `None` if there is no open span. - """ for annotation_index, annotation in enumerate(annotations): # strip all bracketing information to # get the actual propbank label. @@ -1147,7 +1125,7 @@ def _conll_rows_to_sentence(cls, conll_rows: List[str]) -> Dict: speakers.append(speaker if speaker != "-" else None) named_entities = span_labels[0] - srl_frames = [(predicate, labels) for predicate, labels in zip(verbal_predicates, span_labels[1:])] + srl_frames = list(zip(verbal_predicates, span_labels[1:])) # this would not be reached if parse_pieces contained None, hence the cast parse_tree = "".join(cast(List[str], parse_pieces)) if all(parse_pieces) else None @@ -1200,8 +1178,7 @@ def dataset_document_iterator(cls, file_path: Union[Path, str]) -> Iterator[List def sentence_iterator(cls, file_path: Union[Path, str]) -> Iterator: """An iterator over the sentences in an individual CONLL formatted file.""" for document in cls.dataset_document_iterator(file_path): - for sentence in document: - yield sentence + yield from document class CONLL_03(ColumnCorpus): @@ -1306,11 +1283,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1371,10 +1346,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary.
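The base_path override described in these corpus docstrings works the same way across the auto-downloading corpora; a sketch using CONLL_03 as the example (the folder path is a placeholder):

```python
from flair.datasets import CONLL_03

# With base_path=None the corpus is cached under flair.cache_root / "datasets";
# pass a folder to load (or place) the data somewhere else instead.
corpus = CONLL_03(base_path="/data/flair_datasets", in_memory=True)
print(corpus)
```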
- :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1602,10 +1576,10 @@ def __init__( Column order is swapped The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1746,11 +1720,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1853,10 +1826,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ # column format columns = {0: "ner", 1: "text"} @@ -1896,10 +1868,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. 
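The recurring in_memory flag trades RAM for speed; a sketch of both modes, where CONLL_2000 stands in for any of these corpora:

```python
from flair.datasets import CONLL_2000

fast_corpus = CONLL_2000(in_memory=True)   # all Sentence objects held in RAM
lean_corpus = CONLL_2000(in_memory=False)  # read from disk on demand
```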
- POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ # column format columns = {0: "ner", 1: "text"} @@ -1929,22 +1900,20 @@ class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus): - """Initialize corpus of SEC-fillings annotated with English NER tags. - - See paper "Domain Adaption of Named Entity Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/ - - :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine - POS tags or chunks respectively - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - def __init__( self, base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, **corpusargs, ) -> None: + """Initialize corpus of SEC filings annotated with English NER tags. + + See paper "Domain Adaption of Named Entity Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/ + + Args: + base_path: Path to the corpus on your machine + in_memory: If True, keeps dataset in memory giving speedups in training. + """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # column format @@ -2024,11 +1993,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2118,10 +2086,10 @@ def __init__( The corpus will be downoaded from https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training.
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2160,10 +2128,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2212,11 +2179,11 @@ def __init__( The corpus was introduced in the paper "Design Challenges and Misconceptions in Named Entity Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2279,10 +2246,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2340,10 +2307,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
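The SEC-filings corpus whose docstring moves into __init__ above loads like any other ColumnCorpus subclass; a minimal usage sketch:

```python
from flair.datasets import NER_ENGLISH_SEC_FILLINGS

corpus = NER_ENGLISH_SEC_FILLINGS(in_memory=True)
print(corpus)
```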
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2457,10 +2424,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2522,7 +2489,7 @@ def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner """ def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): + for i in range(len(current_line)): if i == 0: f.write(line_list[i]) elif i == ner: @@ -2540,7 +2507,7 @@ def add_I_prefix(current_line: List[str], ner: int, tag: str): if len(line_list) > 2: # word with tags ner_tag = line_list[ner_column] if ner_tag in ["0", "O"]: # no chunk - for i in range(0, len(line_list)): + for i in range(len(line_list)): if i == 0: f.write(line_list[i]) elif i == ner_column: @@ -2891,9 +2858,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -3088,10 +3055,10 @@ def __init__( ) -> None: """Download and Initialize the MultiCoNer corpus. - :param task: either 'multi', 'code-switch', or the language code for one of the mono tasks. - :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine - POS tags or chunks respectively - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + task: either 'multi', 'code-switch', or the language code for one of the mono tasks. + base_path: Path to the CoNLL-03 corpus (i.e. 
'conll_03' folder) on your machine + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4148,10 +4115,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4195,10 +4162,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4243,10 +4210,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4290,10 +4257,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training.
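For the MultiCoNer task parameter documented earlier in this file's hunks, usage would look roughly like the following; the exported class name NER_MULTI_CONER is an assumption, not confirmed by this diff:

```python
from flair.datasets import NER_MULTI_CONER  # class name assumed

# task picks the track: "multi", "code-switch", or a language code such as "en"
corpus = NER_MULTI_CONER(task="en")
print(corpus)
```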
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4337,10 +4304,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4384,10 +4351,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4431,10 +4398,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4478,10 +4445,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4741,10 +4708,10 @@ def __init__( - Domain-specific classification (DSC). Participants will be asked to deploy a different model for each of the above types, trying to increase the accuracy for each considered type. - :param domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all". - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + Args: + domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all". + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. """ supported_domains = ["WN", "FIC", "ADG"] diff --git a/flair/datasets/text_text.py b/flair/datasets/text_text.py index e1727e0cc..e96a5317e 100644 --- a/flair/datasets/text_text.py +++ b/flair/datasets/text_text.py @@ -327,18 +327,18 @@ def __init__( seperated by e.g. '\t' (just like in the glue RTE-dataset https://gluebenchmark.com/tasks) . For each data pair we create a flair.data.DataPair object. - :param path_to_data: path to the data file - :param columns: list of integers that indicate the respective columns. The first entry is the column - for the first sentence, the second for the second sentence and the third for the label. 
Default [0,1,2] - :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens - :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters - :param use_tokenizer: Whether or not to use in-built tokenizer - :param in_memory: If True, data will be saved in list of flair.data.DataPair objects, other wise we use lists with simple strings which needs less space - :param label_type: Name of the label of the data pairs - :param skip_first_line: If True, first line of data file will be ignored - :param separator: Separator between columns in the data file - :param encoding: Encoding of the data file - :param label: If False, the dataset expects unlabeled data + Args: + path_to_data: path to the data file + columns: list of integers that indicate the respective columns. The first entry is the column for the first sentence, the second for the second sentence and the third for the label. Default [0,1,2] + max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens + max_chars_per_doc: If set, shortens sentences to this maximum number of characters + use_tokenizer: Whether to use in-built tokenizer + in_memory: If True, data will be saved in list of flair.data.DataPair objects, otherwise we use lists with simple strings which need less space + label_type: Name of the label of the data pairs + skip_first_line: If True, first line of data file will be ignored + separator: Separator between columns in the data file + encoding: Encoding of the data file + label: If False, the dataset expects unlabeled data """ path_to_data = Path(path_to_data) diff --git a/flair/embeddings/base.py b/flair/embeddings/base.py index bf3e7645a..d3cacd75a 100644 --- a/flair/embeddings/base.py +++ b/flair/embeddings/base.py @@ -123,7 +123,10 @@ def __init__(self, mixture_size: int, trainable: bool = False) -> None: """Inits scalar mix implementation. ``mixture = gamma * sum(s_k * tensor_k)`` where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters. - :param mixture_size: size of mixtures (usually the number of layers) + + Args: + mixture_size: size of mixtures (usually the number of layers) + trainable: whether or not the weights should be learnable. """ super().__init__() self.mixture_size = mixture_size @@ -157,8 +160,11 @@ def forward(self, tensors: List[torch.Tensor]) -> torch.Tensor: Computes a weighted average of the ``tensors``. The input tensors an be any shape with at least two dimensions, but must all be the same shape. - :param tensors: list of input tensors - :return: computed weighted average of input tensors + + Args: + tensors: list of input tensors + + Returns: computed weighted average of input tensors """ if len(tensors) != self.mixture_size: log.error( diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index 0fdfbcd70..8779d418e 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -37,12 +37,14 @@ def __init__( ) -> None: """Bidirectional transformer embeddings of words from various transformer architectures. - :param model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for - options) - :param layers: string indicating which layers to take for embedding (-1 is topmost layer) - :param cls_pooling: Pooling strategy for combining token level embeddings. options are 'cls', 'max', 'mean'.
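The scalar mix documented in the base.py hunk above computes mixture = gamma * sum(s_k * tensor_k) over same-shaped layer tensors; a small usage sketch, assuming the class is the usual ScalarMix from flair.embeddings.base:

```python
import torch
from flair.embeddings.base import ScalarMix

# three "layers" of identical shape (seq_len=5, hidden=8)
layers = [torch.randn(5, 8) for _ in range(3)]

mix = ScalarMix(mixture_size=3, trainable=True)
mixed = mix(layers)  # weighted average with softmaxed weights and scalar gamma
print(mixed.shape)   # torch.Size([5, 8])
```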
- :param layer_mean: If True, uses a scalar mix of layers as embedding - :param fine_tune: If True, allows transformers to be fine-tuned during training + Args: + model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) + layers: string indicating which layers to take for embedding (-1 is topmost layer) + cls_pooling: Pooling strategy for combining token level embeddings. Options are 'cls', 'max', 'mean'. + layer_mean: If True, uses a scalar mix of layers as embedding + fine_tune: If True, allows transformers to be fine-tuned during training + is_token_embedding: If True, the embedding can be used as TokenEmbedding too. + **kwargs: Arguments propagated to :meth:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, @@ -71,10 +73,10 @@ def __init__( ) -> None: """The constructor takes a list of embeddings to be combined. - :param embeddings: a list of token embeddings - :param fine_tune_mode: if set to "linear" a trainable layer is added, if set to - "nonlinear", a nonlinearity is added as well. Set this to make the pooling trainable. - :param pooling: a string which can any value from ['mean', 'max', 'min'] + Args: + embeddings: a list of token embeddings + fine_tune_mode: if set to "linear" a trainable layer is added, if set to "nonlinear", a nonlinearity is added as well. Set this to make the pooling trainable. + pooling: a string which can be any value from ['mean', 'max', 'min'] """ super().__init__() @@ -173,8 +175,10 @@ def __init__( ) -> None: """The constructor for DocumentTFIDFEmbeddings. - :param train_dataset: the train dataset which will be used to construct a vectorizer - :param vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor + Args: + train_dataset: the train dataset which will be used to construct a vectorizer + vectorizer: a precalculated vectorizer. If provided, requires the train_dataset to be an empty list. + vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor """ super().__init__() @@ -237,23 +241,23 @@ def __init__( dropout: float = 0.5, word_dropout: float = 0.0, locked_dropout: float = 0.0, - rnn_type="GRU", + rnn_type: str = "GRU", fine_tune: bool = True, ) -> None: """Instantiates an RNN that works upon some token embeddings. - :param embeddings: a list of token embeddings - :param hidden_size: the number of hidden states in the rnn - :param rnn_layers: the number of layers for the rnn - :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear - layer before putting them into the rnn or not - :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output - dimension as before will be taken.
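The cls_pooling options above select how the document vector is derived from token-level embeddings; a minimal sketch, where the model name is just an example:

```python
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings

emb = TransformerDocumentEmbeddings(
    "distilbert-base-uncased",  # example model
    cls_pooling="mean",         # or "cls" / "max"
    fine_tune=False,
)
sentence = Sentence("Berlin is a city in Germany .")
emb.embed(sentence)
print(sentence.embedding.shape)
```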
- :param bidirectional: boolean value, indicating whether to use a bidirectional rnn or not - :param dropout: the dropout value to be used - :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used - :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used - :param rnn_type: 'GRU' or 'LSTM' + Args: + embeddings: a list of token embeddings + hidden_size: the number of hidden states in the rnn + rnn_layers: the number of layers for the rnn + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the rnn or not + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. + bidirectional: boolean value, indicating whether to use a bidirectional rnn or not + dropout: the dropout value to be used + word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + rnn_type: 'GRU' or 'LSTM' + fine_tune: if True, allows fine-tuning the embeddings. """ super().__init__() @@ -539,9 +543,9 @@ def __init__( ) -> None: """Instantiates a document embedding using the SentenceTransformer Embeddings. - :param model: string name of models from SentencesTransformer Class - :param name: string name of embedding type which will be set to Sentence object - :param batch_size: int number of sentences to processed in one batch + Args: + model: string name of models from the SentenceTransformer class + batch_size: int number of sentences to be processed in one batch """ super().__init__() @@ -611,17 +615,17 @@ def __init__( locked_dropout: float = 0.0, fine_tune: bool = True, ) -> None: - """Instantiates a CNN that works uppons some token embeddings. - - :param embeddings: a list of token embeddings - :param kernels: list of (number of kernels, kernel size) - :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear - layer before putting them into the rnn or not - :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output - dimension as before will be taken. - :param dropout: the dropout value to be used - :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used - :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + """Instantiates a CNN that works upon some token embeddings. + + Args: + embeddings: a list of token embeddings + kernels: list of (number of kernels, kernel size) + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the rnn or not + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. + dropout: the dropout value to be used + word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + fine_tune: if True, allows fine-tuning the embeddings.
""" super().__init__() diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 8ef43d239..6096fa758 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -4,7 +4,7 @@ from typing import List, Optional, Union import torch -from deprecated import deprecated +from deprecated.sphinx import deprecated import flair from flair.data import Sentence, Token @@ -172,14 +172,16 @@ def __init__( cache_directory: Optional[Path] = None, ) -> None: """Initializes contextual string embeddings using a character-level language model. - :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', + + Args: + model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. - :param detach: if set to False, the gradient will propagate into the language model. this dramatically slows down + detach: if set to False, the gradient will propagate into the language model. this dramatically slows down training and often leads to worse results, so not recommended. - :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will + use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will not allow re-use of once computed embeddings that do not fit into memory - :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache + cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache is written to the provided directory. """ super().__init__() @@ -527,17 +529,17 @@ def __init__( locked_dropout: float = 0.0, ) -> None: """The constructor takes a list of embeddings to be combined. - :param embeddings: a list of token embeddings - :param hidden_size: the number of hidden states in the lstm - :param rnn_layers: the number of layers for the lstm - :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear - layer before putting them into the lstm or not - :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output - dimension as before will be taken. - :param bidirectional: boolean value, indicating whether to use a bidirectional lstm or not - :param dropout: the dropout value to be used - :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used - :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. + + Args: + embeddings: a list of token embeddings + hidden_size: the number of hidden states in the lstm + rnn_layers: the number of layers for the lstm + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the lstm or not. + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. + bidirectional: boolean value, indicating whether to use a bidirectional lstm or not + dropout: the dropout value to be used + word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. 
""" super().__init__() diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 6dd221933..b4e3edc1f 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -11,6 +11,7 @@ import numpy as np import torch from bpemb import BPEmb +from deprecated.sphinx import deprecated from gensim.models import KeyedVectors from gensim.models.fasttext import FastTextKeyedVectors, load_facebook_vectors from torch import nn @@ -40,13 +41,11 @@ def __init__( ) -> None: """Bidirectional transformer embeddings of words from various transformer architectures. - :param model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for - options) - :param layers: string indicating which layers to take for embedding (-1 is topmost layer) - :param subtoken_pooling: how to get from token piece embeddings to token embedding. Either take the first - subtoken ('first'), the last subtoken ('last'), both first and last ('first_last') or a mean over all ('mean') - :param layer_mean: If True, uses a scalar mix of layers as embedding - :param fine_tune: If True, allows transformers to be fine-tuned during training + Args: + model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) + is_document_embedding: If True, the embedding can be used as DocumentEmbedding too. + allow_long_sentences: If True, too long sentences will be patched and strided and afterwards combined. + **kwargs: Arguments propagated to :meth:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, @@ -165,9 +164,17 @@ def __init__( """Initializes classic word embeddings. Constructor downloads required files if not there. - :param embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or custom - If you want to use a custom embedding file, just pass the path to the embeddings as embeddings variable. - set stable=True to use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + + Args: + embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or a path to a custom embedding + field: if given, the word-embeddings embed the data for the specific label-type instead of the plain text. + fine_tune: If True, allows word-embeddings to be fine-tuned during training + force_cpu: If True, stores the large embedding matrix not on the gpu to save memory. `force_cpu` can only be used if `fine_tune` is False + stable: if True, use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + no_header: only for reading plain word2vec text files. If true, the reader assumes the first line to not contain embedding length and vocab size. + vocab: If the embeddings are already loaded in memory, provide the vocab here. + embedding_length: If the embeddings are already loaded in memory, provide the embedding_length here. + name: The name of the embeddings. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -569,18 +576,21 @@ def __init__( ) -> None: """Initializes contextual string embeddings using a character-level language model. - :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', - 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward', - etc (see https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) - depending on which character language model is desired. 
- :param fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows - down training and often leads to overfitting, so use with caution. - :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster - but requires more memory. Lower means slower but less memory. - :param with_whitespace: If True, use hidden state after whitespace after word. If False, use hidden - state at last character of word. - :param tokenized_lm: Whether this lm is tokenized. Default is True, but for LMs trained over unprocessed text - False might be better. + Args: + model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', + 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. + fine_tune: if set to True, the gradient will propagate into the language model. + This dramatically slows down training and often leads to overfitting, so use with caution. + chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. + Higher means faster but requires more memory. Lower means slower but less memory. + with_whitespace: If True, use hidden state after whitespace after word. + If False, use hidden state at last character of word. + tokenized_lm: Whether this lm is tokenized. Default is True, + but for LMs trained over unprocessed text False might be better. + has_decoder: Whether to load the decoder head of the LanguageModel. This should only be true if you intend + to generate text. + is_lower: Whether this lm is trained on lower-cased data. + name: The name of the embeddings """ super().__init__() self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1015,8 +1025,11 @@ def __init__( Constructor downloads required embedding file and stores in cache if use_local is False. - :param embeddings: path to your embeddings '.bin' file - :param use_local: set this to False if you are using embeddings from a remote source + Args: + embeddings: path to your embeddings '.bin' file + use_local: set this to False if you are using embeddings from a remote source + field: if given, the word-embeddings embed the data for the specific label-type instead of the plain text. + name: The name of the embeddings. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1103,10 +1116,11 @@ def __init__( ) -> None: """Initializes one-hot encoded word embeddings and a trainable embedding layer. - :param vocab_dictionary: the vocabulary that will be encoded - :param field: by default, the 'text' of tokens is embedded, but you can also embed tags such as 'pos' - :param embedding_length: dimensionality of the trainable embedding layer - :param stable: set stable=True to use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + Args: + vocab_dictionary: the vocabulary that will be encoded + field: by default, the 'text' of tokens is embedded, but you can also embed tags such as 'pos' + embedding_length: dimensionality of the trainable embedding layer + stable: if True, use the stable embeddings as described in https://arxiv.org/abs/2110.02861 """ super().__init__() self.name = f"one-hot-{field}" @@ -1352,7 +1366,15 @@ def to_params(self): # TODO: keep for backwards compatibility, but remove in future +@deprecated( + reason="""'BPEmbSerializable' is only used in the legacy pickle-embeddings format.
+ Please save your model again to save it in the serializable json format. + """, + version="0.13.0", +) class BPEmbSerializable(BPEmb): + """Helper class to allow pickle-serializable BPE embeddings.""" + def __getstate__(self): state = self.__dict__.copy() # save the sentence piece model as binary file (not as path which may change) @@ -1413,7 +1435,7 @@ def __init__( ), "Need to specify model_file_path and embedding_file_path if no language is given in BytePairEmbeddings(...)" dim = None # type: ignore[assignment] - self.embedder = BPEmbSerializable( + self.embedder = BPEmb( lang=language, vs=syllables, dim=dim, @@ -1489,9 +1511,10 @@ def __init__(self, embeddings: str, model: str = "skip", size: int = 100) -> Non See: http://www.nilc.icmc.usp.br/embeddings Constructor downloads required files if not there. - :param embeddings: one of: 'fasttext', 'glove', 'wang2vec' or 'word2vec' - :param model: one of: 'skip' or 'cbow'. This is not applicable to glove. - :param size: one of: 50, 100, 300, 600 or 1000. + Args: + embeddings: one of: 'fasttext', 'glove', 'wang2vec' or 'word2vec' + model: one of: 'skip' or 'cbow'. This is not applicable to glove. + size: one of: 50, 100, 300, 600 or 1000. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1549,3 +1572,19 @@ def replace_with_language_code(string: str): string = string.replace("spanish-", "es-") string = string.replace("swedish-", "sv-") return string + + +__all__ = [ + "TransformerWordEmbeddings", + "StackedEmbeddings", + "WordEmbeddings", + "CharacterEmbeddings", + "FlairEmbeddings", + "PooledFlairEmbeddings", + "FastTextEmbeddings", + "OneHotEmbeddings", + "HashEmbeddings", + "MuseCrosslingualEmbeddings", + "BytePairEmbeddings", + "NILCEmbeddings", +] diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 9d2a8b5ab..b3b838507 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1387,7 +1387,11 @@ def export_onnx( ) -> TransformerOnnxEmbeddings: """Export TransformerEmbeddings to OnnxFormat. - :param example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 - sentences with some variation. + Args: + path: the path to save the embeddings. Notice that the embeddings are stored as an external file, + hence it matters if the path is an absolute path or a relative one. + example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 + sentences with some variation. + **kwargs: the parameters passed to :meth:`TransformerOnnxEmbeddings.export_from_embedding` """ return self.onnx_cls.export_from_embedding(path, self, example_sentences, **kwargs) diff --git a/flair/file_utils.py b/flair/file_utils.py index edad5c71b..7f0ba5f9e 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -30,11 +30,12 @@ def set_proxies(proxies: typing.Dict[str, str]) -> None: - """Allows for data downloaded from urls to be forwarded to a proxy. + r"""Allows for data downloaded from urls to be forwarded to a proxy. see https://requests.readthedocs.io/en/latest/user/advanced/#proxies - :param proxies: A dictionary of proxies according to the requests documentation. - :return: None + + Args: + proxies: A dictionary of proxies according to the requests documentation. """ global url_proxies url_proxies = proxies @@ -44,8 +45,6 @@ def load_big_file(f: str): """Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions.
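To illustrate the `set_proxies` helper documented above: the dictionary follows the `requests` proxies format, and the proxy URLs below are placeholders:

from flair.file_utils import set_proxies

# route flair's download traffic through a proxy (placeholder addresses)
set_proxies({
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
})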
- :param f: - :return: """ with open(f, "rb") as f_in: # mmap seems to be much more memory efficient @@ -147,10 +146,11 @@ def unzip_file(file: Union[str, Path], unzip_to: Union[str, Path]): def unpack_file(file: Path, unpack_to: Path, mode: Optional[str] = None, keep: bool = True): """Unpacks an archive file to the given location. - :param file Archive file to unpack - :param unpack_to Destination where to store the output - :param mode Type of the archive (zip, tar, gz, targz, rar) - :param keep Indicates whether to keep the archive after extraction or delete it + Args: + file: Archive file to unpack + unpack_to: Destination where to store the output + mode: Type of the archive (zip, tar, gz, targz, rar) + keep: Indicates whether to keep the archive after extraction or delete it """ if mode == "zip" or (mode is None and str(file).endswith("zip")): from zipfile import ZipFile diff --git a/flair/inference_utils.py b/flair/inference_utils.py index 035025c00..031067153 100644 --- a/flair/inference_utils.py +++ b/flair/inference_utils.py @@ -64,10 +64,10 @@ class WordEmbeddingsStore: def __init__(self, embedding: WordEmbeddings, backend="sqlite", verbose=True) -> None: """Instantiates the WordEmbeddingsStore. - :param embedding: Flair WordEmbeddings instance. - :param backend: cache database backend name e.g ``'sqlite'``, ``'lmdb'``. - Default value is ``'sqlite'``. - :param verbose: If `True` print information on standard output + Args: + embedding: The WordEmbeddings to store. + backend: cache database backend name e.g. ``'sqlite'``, ``'lmdb'``. Default value is ``'sqlite'``. + verbose: If `True`, print information on standard output """ self.items = "" diff --git a/flair/models/clustering.py b/flair/models/clustering.py index 00eb88563..e9902f6f6 100644 --- a/flair/models/clustering.py +++ b/flair/models/clustering.py @@ -22,8 +22,9 @@ class ClusteringModel: def __init__(self, model: Union[ClusterMixin, BaseEstimator], embeddings: DocumentEmbeddings) -> None: """Instantiate the ClusteringModel. - :param model: the clustering algorithm from sklearn this wrapper will use. - :param embeddings: the flair DocumentEmbedding this wrapper uses to calculate a vector for each sentence. + Args: + model: the clustering algorithm from sklearn this wrapper will use. + embeddings: the flair DocumentEmbedding this wrapper uses to calculate a vector for each sentence. """ self.model = model self.embeddings = embeddings @@ -31,7 +32,9 @@ def __init__(self, model: Union[ClusterMixin, BaseEstimator], embeddings: Docume def fit(self, corpus: Corpus, **kwargs): """Trains the model. - :param corpus: the flair corpus this wrapper will use for fitting the model. + Args: + corpus: the flair corpus this wrapper will use for fitting the model. + **kwargs: parameters propagated to the model's `.fit()` method. """ X = self._convert_dataset(corpus) @@ -42,7 +45,8 @@ def fit(self, corpus: Corpus, **kwargs): def predict(self, corpus: Corpus): """Predict labels given a list of sentences and returns the respective class indices. - :param corpus: the flair corpus this wrapper will use for predicting the labels. + Args: + corpus: the flair corpus this wrapper will use for predicting the labels. """ X = self._convert_dataset(corpus) log.info("Start the prediction " + str(self.model) + " with " + str(len(X)) + " Datapoints.") @@ -57,7 +61,8 @@ def predict(self, corpus: Corpus): def save(self, model_file: Union[str, Path]): """Saves current model. - :param model_file: path where to save the model.
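A minimal sketch of the `WordEmbeddingsStore` cache documented above, assuming the 'glove' embeddings can be downloaded:

from flair.embeddings import WordEmbeddings
from flair.inference_utils import WordEmbeddingsStore

# swap the in-memory embedding matrix for an on-disk sqlite cache
embedding = WordEmbeddings("glove")
store = WordEmbeddingsStore(embedding, backend="sqlite", verbose=True)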
+ Args: + model_file: path where to save the model. """ joblib.dump(pickle.dumps(self), str(model_file)) @@ -67,7 +72,8 @@ def load(model_file: Union[str, Path]): """Loads a model from a given path. - :param model_file: path to the file where the model is saved. + Args: + model_file: path to the file where the model is saved. """ log.info("Loading model from: " + str(model_file)) return pickle.loads(joblib.load(str(model_file))) @@ -79,8 +85,6 @@ def _convert_dataset( Turns the corpora into X, y datasets as required for most sklearn clustering models. Ref.: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster - - :param label_type: the label from sentences will be extracted. If the value is none this will be skipped. """ log.info("Embed sentences...") sentences = [] @@ -107,8 +111,9 @@ def evaluate(self, corpus: Corpus, label_type: str): Also, the result of the evaluation is logged. - :param corpus: the flair corpus this wrapper will use for evaluation. - :param label_type: the label from the sentence will be used for the evaluation. + Args: + corpus: the flair corpus this wrapper will use for evaluation. + label_type: the label from the sentence will be used for the evaluation. """ X, Y = self._convert_dataset(corpus, label_type=label_type) predict = self.model.predict(X) diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index ae4f2ecb0..3e46f3a2d 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -94,12 +94,16 @@ def __init__( ) -> None: """Initializes an EntityLinker. - :param embeddings: embeddings used to embed the words/sentences - :param label_dictionary: dictionary that gives ids to all classes. Should contain <unk> - :param pooling_operation: either 'average', 'first', 'last' or 'first&last'. Specifies the way of how text representations of entity mentions (with more than one word) are handled. - E.g. 'average' means that as text representation we take the average of the embeddings of the words in the mention. 'first&last' concatenates - the embedding of the first and the embedding of the last word. - :param label_type: name of the label you use. + Args: + embeddings: embeddings used to embed the tokens of the sentences. + label_dictionary: dictionary that gives ids to all classes. Should contain <unk>. + pooling_operation: either `average`, `first`, `last` or `first_last`. Specifies the way of how text + representations of entity mentions (with more than one token) are handled. E.g. `average` means that as + text representation we take the average of the embeddings of the tokens in the mention. + `first_last` concatenates the embedding of the first and the embedding of the last token. + label_type: name of the label you use. + candidates: If provided, use a :class:`CandidateGenerator` for prediction candidates. + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( embeddings=embeddings, diff --git a/flair/models/lemmatizer_model.py b/flair/models/lemmatizer_model.py index dfeef8e9b..6700b089d 100644 --- a/flair/models/lemmatizer_model.py +++ b/flair/models/lemmatizer_model.py @@ -40,22 +40,28 @@ def __init__( that predicts the lemma of the given token one letter at a time. Note that one can use data in which only those words are annotated that differ from their lemma or data in which all words are annotated with a (maybe equal) lemma.
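A short sketch of the `ClusteringModel` wrapper documented above, assuming the TREC_6 corpus and its 'question_class' label type:

from sklearn.cluster import KMeans
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import ClusteringModel

corpus = TREC_6()
model = ClusteringModel(
    model=KMeans(n_clusters=6),
    embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased"),
)
model.fit(corpus)                  # embeds all sentences, then fits KMeans
model.save("clustering_model.pt")  # serialized via joblib/pickle
model.evaluate(corpus, label_type="question_class")  # logs the evaluation result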
- :param embeddings: Embedding used to encode sentence - :param rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a hot-one-vector - over the given character dictionary. This vector is transformed to a input_size vector with a linear layer. - :param rnn_hidden_size: size of the hidden state of the RNN('s). - :param rnn_layers: Number of stacked RNN cells - :param beam_size: Number of hypothesis used when decoding the output of the RNN. Only used in prediction. - :param char_dict: Dictionary of characters the model is able to process. The dictionary must contain <unk> for - the handling of unknown characters. If None, a standard dictionary will be loaded. One can either hand - over a path to a dictionary or the dictionary itself. - :param label_type: Name of the gold labels to use. - :param max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence in - the prediction depends on the sentences you want to lemmatize. To be precise the maximum length is - computed as the length of the longest token in the sentences plus one. - :param max_sequence_length: If set to True and max_sequence_length_dependend_on_input is False a fixed - maximum length for the decoding will be used for all sentences. - :param use_attention: whether to use attention. Only sensible if encoding via RNN + + Args: + encode_characters: If True, use a character embedding to additionally encode tokens per character. + start_symbol_for_encoding: If True, use a start symbol for encoding characters. + end_symbol_for_encoding: If True, use an end symbol for encoding characters. + bidirectional_encoding: If True, the character encoding is bidirectional. + embeddings: Embedding used to encode sentence + rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a one-hot vector over + the given character dictionary. This vector is transformed to an input_size vector with a linear layer. + rnn_hidden_size: size of the hidden state of the RNN('s). + rnn_layers: Number of stacked RNN cells + beam_size: Number of hypotheses used when decoding the output of the RNN. Only used in prediction. + char_dict: Dictionary of characters the model is able to process. The dictionary must contain <unk> for + the handling of unknown characters. If None, a standard dictionary will be loaded. One can either hand + over a path to a dictionary or the dictionary itself. + label_type: Name of the gold labels to use. + max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence in + the prediction depends on the sentences you want to lemmatize. To be precise the maximum length is + computed as the length of the longest token in the sentences plus one. + max_sequence_length: If max_sequence_length_dependent_on_input is False, a fixed + maximum length for the decoding will be used for all sentences. + use_attention: whether to use attention. Only sensible if encoding via RNN """ super().__init__() @@ -161,13 +167,17 @@ def words_to_char_indices( ): """For a given list of strings this function creates index vectors that represent the characters of the strings. - Each string is represented by sequence_length (maximum string length + entries for special symbold) many + Each string is represented by sequence_length (maximum string length + entries for special symbols) many indices representing characters in self.char_dict.
One can manually set the vector length with the parameter seq_length, though the vector length is always at least maximum string length in the list. - :param end_symbol: add self.end_index at the end of each representation - :param start_symbol: add self.start_index in front of of each representation - :param padding_in_front: whether to fill up with self.dummy_index in front or in back of strings + + Args: + seq_length: the maximum sequence length to use, if None the maximum is taken. + tokens: the texts of the tokens to encode + end_symbol: add self.end_index at the end of each representation + start_symbol: add self.start_index in front of each representation + padding_in_front: whether to fill up with self.dummy_index in front or in back of strings """ # add additional columns for special symbols if necessary c = int(end_symbol) + int(start_symbol) @@ -403,15 +413,14 @@ def predict( ): """Predict lemmas of words for a given (list of) sentence(s). - :param sentences: sentences to predict - :param label_name: label name used for predicted lemmas - :param mini_batch_size: number of tokens that are send through the RNN simultaneously, assuming batching_in_rnn - is set to True - :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if - you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. - :param return_loss: whether or not to compute and return loss. Setting it to True only makes sense if labels - are provided - :param verbose: If True, lemmatized sentences will be printed in the console. + Args: + sentences: sentences to predict + label_name: label name used for predicted lemmas + mini_batch_size: number of tokens that are sent through the RNN simultaneously, assuming batching_in_rnn is set to True + embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. + return_loss: whether to compute and return loss. Setting it to True only makes sense if labels are provided + verbose: If True, lemmatized sentences will be printed in the console. + return_probabilities_for_all_classes: unused parameter. """ if isinstance(sentences, Sentence): sentences = [sentences] diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py index a68c69862..d7fb262da 100644 --- a/flair/models/multitask_model.py +++ b/flair/models/multitask_model.py @@ -6,7 +6,7 @@ import torch import flair.nn -from flair.data import DT, Dictionary, Sentence +from flair.data import DT, Sentence from flair.file_utils import cached_path from flair.nn import Classifier from flair.training_utils import Result @@ -33,7 +33,11 @@ def __init__( ) -> None: """Instantiates the MultiTaskModel. - :param models: Key (Task ID) - Value (flair.nn.Model) Pairs to stack model + Args: + models: The child models used during multitask training. + task_ids: If given, assign each corresponding model the specified task id. Otherwise, tasks get the ids 'Task_0', 'Task_1', ... + loss_factors: If given, weight the losses of the corresponding models during training. + use_all_tasks: If True, each sentence will be trained on all tasks in parallel; otherwise, for each epoch one task will be sampled to train the sentence on.
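To make the `MultitaskModel` arguments above concrete, a hedged sketch (`tagger` and `classifier` stand for flair models built elsewhere):

from flair.models import MultitaskModel

# 'tagger' and 'classifier' are placeholder child models created beforehand
multitask_model = MultitaskModel(
    models=[tagger, classifier],
    task_ids=["Task_0", "Task_1"],
    loss_factors=[1.0, 0.5],  # down-weight the second task's loss
    use_all_tasks=False,      # sample one task per sentence instead of training all
)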
""" super().__init__() @@ -64,8 +68,10 @@ def _prepare_tensors(self, data_points: List[DT]) -> Tuple[torch.Tensor, ...]: def forward_loss(self, sentences: Union[List[Sentence], Sentence]) -> Tuple[torch.Tensor, int]: """Calls the respective forward loss of each model and sums them weighted by their loss factors. - :param sentences: batch of sentences - :return: loss + Args: + sentences: batch of sentences + + Returns: loss and sample count """ batch_split = self.split_batch_to_task_ids(sentences, all_tasks=self.use_all_tasks) loss = torch.tensor(0.0, device=flair.device) @@ -90,9 +96,12 @@ def split_batch_to_task_ids(sentences: Union[List[Sentence], Sentence], all_task If single sentence is assigned to several tasks (i.e. same corpus but different tasks), then the model assignment for this batch is randomly chosen. - :param sentences: batch of sentences - :param all_tasks: use all tasks of each sentence. If deactivated, a random task will be sampled - :return: Key-value pairs as (task_id, list of sentences ids in batch) + + Args: + sentences: batch of sentences + all_tasks: use all tasks of each sentence. If deactivated, a random task will be sampled + + Returns: Key-value pairs as (task_id, list of sentences ids in batch) """ batch_to_task_mapping: Dict[str, List[int]] = {} for sentence_id, sentence in enumerate(sentences): @@ -107,28 +116,26 @@ def split_batch_to_task_ids(sentences: Union[List[Sentence], Sentence], all_task batch_to_task_mapping[multitask_id.value] = [sentence_id] return batch_to_task_mapping - def evaluate( + def evaluate( # type: ignore[override] self, data_points, gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], - gold_label_dictionary: Optional[Dictionary] = None, - return_loss: bool = True, evaluate_all: bool = True, **evalargs, ) -> Result: """Evaluates the model. Returns a Result object containing evaluation results and a loss value. - :param sentences: batch of sentences - :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), - 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) - :param mini_batch_size: size of batches - :param evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type - :return: Tuple of Result object and loss value (float) + Args: + data_points: batch of sentences + gold_label_type: if evaluate_all is False, specify the task to evaluate by the task_id. + out_path: if not None, predictions will be created and saved at the respective file. 
+ main_evaluation_metric: Specify which metric to highlight as main_score + evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type + **evalargs: arguments propagated to :meth:`flair.nn.Model.evaluate` + + Returns: Tuple of Result object and loss value (float) """ if not evaluate_all: if gold_label_type not in self.tasks: @@ -144,12 +151,7 @@ def evaluate( data, gold_label_type=self.tasks[gold_label_type].label_type, out_path=out_path, - embedding_storage_mode=embedding_storage_mode, - mini_batch_size=mini_batch_size, main_evaluation_metric=main_evaluation_metric, - exclude_labels=exclude_labels, - gold_label_dictionary=gold_label_dictionary, - return_loss=return_loss, **evalargs, ) @@ -165,12 +167,7 @@ def evaluate( data_points=[data_points[i] for i in split], gold_label_type=self.tasks[task_id].label_type, out_path=f"{out_path}_{task_id}.txt" if out_path is not None else None, - embedding_storage_mode=embedding_storage_mode, - mini_batch_size=mini_batch_size, main_evaluation_metric=main_evaluation_metric, - exclude_labels=exclude_labels, - gold_label_dictionary=gold_label_dictionary, - return_loss=return_loss, **evalargs, ) @@ -204,10 +201,7 @@ def evaluate( ) def _get_state_dict(self): - """Returns the state dict of the multitask model which has multiple models underneath. - - :return model_state: model state for the multitask model - """ + """Returns the state dict of the multitask model which has multiple models underneath.""" initial_model_state = super()._get_state_dict() initial_model_state["state_dict"] = {} # the model state is stored per model already. model_state = { diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index e6598d87a..2d8f76842 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -26,13 +26,18 @@ def __init__( ) -> None: """Initializes a TextPairClassifier. - :param embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction - or False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + label_type: name of the label + embed_separately: if True, the sentence embeddings will be concatenated, + if False, both sentences will be combined and newly embedded. + embeddings: embeddings used to embed each data point + label_dictionary: dictionary of labels you want to predict + multi_label: auto-detected by default, but you can set this to True to force multi-label prediction + or False to force single-label prediction + multi_label_threshold: If multi-label you can set the threshold to make predictions + loss_weights: Dictionary of weights for labels for the loss function.
+ If any label's weight is unspecified it will default to 1.0 + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/pairwise_regression_model.py b/flair/models/pairwise_regression_model.py index 0cec52070..1841e0767 100644 --- a/flair/models/pairwise_regression_model.py +++ b/flair/models/pairwise_regression_model.py @@ -33,12 +33,19 @@ def __init__( locked_dropout: float = 0.0, word_dropout: float = 0.0, decoder: Optional[torch.nn.Module] = None, - **classifierargs, ) -> None: """Initialize the Text Pair Regression Model. - :param embeddings: embeddings used to embed each data point - :param label_name: + Args: + label_type: name of the label + embed_separately: if True, the sentence embeddings will be concatenated, + if False, both sentences will be combined and newly embedded. + dropout: dropout probability + locked_dropout: locked dropout probability + word_dropout: word dropout probability + decoder: if provided, that specific layer will be used as decoder, + otherwise a linear layer with random parameters will be created. + embeddings: embeddings used to embed each data point """ super().__init__() diff --git a/flair/models/regexp_tagger.py b/flair/models/regexp_tagger.py index a6b7f6c80..35c244d96 100644 --- a/flair/models/regexp_tagger.py +++ b/flair/models/regexp_tagger.py @@ -10,7 +10,8 @@ class TokenCollection: """A utility class for RegexpTagger to hold all tokens for a given Sentence and define some functionality. - :param sentence: A Sentence object + Args: + sentence: A Sentence object """ sentence: Sentence @@ -33,8 +34,10 @@ def get_token_span(self, span: Tuple[int, int]) -> Span: spanning the tokens included in the interval. If the interval is overlapping with a token span, a ValueError is raised - :param span: Start and end pos of the requested span as tuple - :return: A span object spanning the requested token interval + Args: + span: Start and end pos of the requested span as tuple + + Returns: A span object spanning the requested token interval """ span_start: int = self.__tokens_start_pos.index(span[0]) span_end: int = self.__tokens_end_pos.index(span[1]) @@ -52,7 +55,8 @@ def __init__(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]) -> No If a match violates (in this case overlaps) a token span, an exception is raised. - :param mapping: A list of tuples or a single tuple representing a mapping as regexp -> label + Args: + mapping: A list of tuples or a single tuple representing a mapping as regexp -> label """ self._regexp_mapping: Dict[str, typing.Pattern] = {} self.register_labels(mapping=mapping) @@ -64,7 +68,8 @@ def registered_labels(self): def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]): """Register a regexp -> label mapping. - :param mapping: A list of tuples or a single tuple representing a mapping as regexp -> label + Args: + mapping: A list of tuples or a single tuple representing a mapping as regexp -> label """ mapping = self._listify(mapping) @@ -79,7 +84,8 @@ def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]] def remove_labels(self, labels: Union[List[str], str]): """Remove a registered regexp -> label mapping given by label. - :param labels: A list of labels or a single label as strings. + Args: + labels: A list of labels or a single label as strings.
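A small sketch of the `RegexpTagger` mapping API documented above; the 'NUMBER' and 'ACRONYM' labels are arbitrary examples:

from flair.data import Sentence
from flair.models import RegexpTagger

# register one mapping at construction time and another one later
tagger = RegexpTagger(mapping=(r"\d+", "NUMBER"))
tagger.register_labels(mapping=(r"[A-Z]{2,}", "ACRONYM"))

sentence = Sentence("The NASA budget for 2023 was 25400 million dollars.")
tagger.predict(sentence)
print(sentence)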
""" labels = self._listify(labels) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 43b7dc203..c819dd135 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -219,18 +219,20 @@ class _Entity(NamedTuple): class RelationClassifier(flair.nn.DefaultClassifier[EncodedSentence, EncodedSentence]): """Relation Classifier to predict the relation between two entities. - ---- Task ---- + Task + ---- Relation Classification (RC) is the task of identifying the semantic relation between two entities in a text. In contrast to (end-to-end) Relation Extraction (RE), RC requires pre-labelled entities. Example: - ------- + -------- For the `founded_by` relation from `ORG` (head) to `PER` (tail) and the sentence "Larry Page and Sergey Brin founded Google .", we extract the relations - founded_by(head='Google', tail='Larry Page') and - founded_by(head='Google', tail='Sergey Brin'). - ---- Architecture ---- + Architecture + ------------ The Relation Classifier Model builds upon a text classifier. The model generates an encoded sentence for each entity pair in the cross product of all entities in the original sentence. @@ -241,7 +243,9 @@ class RelationClassifier(flair.nn.DefaultClassifier[EncodedSentence, EncodedSent The implemented encoding strategies are taken from this paper by Zhou et al.: https://arxiv.org/abs/2102.01373 - Note: Currently, the model has no multi-label support. + .. warning:: + Currently, the model has no multi-label support. + """ def __init__( @@ -260,34 +264,18 @@ def __init__( ) -> None: """Initializes a `RelationClassifier`. - :param embeddings: The document embeddings used to embed each sentence - :param label_dictionary: A Dictionary containing all predictable labels from the corpus - :param label_type: The label type which is going to be predicted, in case a corpus has multiple annotations - :param entity_label_types: A label type or sequence of label types of the required relation entities. - You can also specify a label filter in a dictionary with the label type as key and - the valid entity labels as values in a set. - E.g. to use only 'PER' and 'ORG' labels from a NER-tagger: `{'ner': {'PER', 'ORG'}}`. - To use all labels from 'ner', pass 'ner'. - :param entity_pair_labels: A set of valid relation entity pair combinations, used as relation candidates. - Specify valid entity pairs in a set of tuples of labels (, ). - E.g. for the `born_in` relation, only relations from 'PER' to 'LOC' make sense. - Here, relations from 'PER' to 'PER' are not meaningful, so - it is advised to specify the `entity_pair_labels` as `{('PER', 'ORG')}`. - This setting may help to reduce the number of relation candidates. - Leaving this parameter as `None` (default) disables the relation-candidate-filter, - i.e. the model classifies the relation for each entity pair - in the cross product of *all* entity pairs (inefficient). - :param entity_threshold: Only pre-labelled entities above this threshold are taken into account by the model. - :param cross_augmentation: If `True`, use cross augmentation to transform `Sentence`s into `EncodedSentenece`s. - When cross augmentation is enabled, the transformation functions, - e.g. `transform_corpus`, generate an encoded sentence for each entity pair - in the cross product of all entities in the original sentence. 
- When disabling cross augmentation, the transform functions only generate - encoded sentences for each gold relation annotation in the original sentence. - :param encoding_strategy: An instance of a class conforming the :class:`EncodingStrategy` protocol - :param zero_tag_value: The label to use for out-of-class relations - :param allow_unk_tag: If `False`, removes `<unk>` from the passed label dictionary, otherwise do nothing. - :param classifierargs: The remaining parameters passed to the underlying `DefaultClassifier` + Args: + embeddings: The document embeddings used to embed each sentence + label_dictionary: A Dictionary containing all predictable labels from the corpus + label_type: The label type which is going to be predicted, in case a corpus has multiple annotations + entity_label_types: A label type or sequence of label types of the required relation entities. You can also specify a label filter in a dictionary with the label type as key and the valid entity labels as values in a set. E.g. to use only 'PER' and 'ORG' labels from a NER-tagger: `{'ner': {'PER', 'ORG'}}`. To use all labels from 'ner', pass 'ner'. + entity_pair_labels: A set of valid relation entity pair combinations, used as relation candidates. Specify valid entity pairs in a set of tuples of labels (<HEAD>, <TAIL>). E.g. for the `born_in` relation, only relations from 'PER' to 'LOC' make sense. Here, relations from 'PER' to 'PER' are not meaningful, so it is advised to specify the `entity_pair_labels` as `{('PER', 'LOC')}`. This setting may help to reduce the number of relation candidates. Leaving this parameter as `None` (default) disables the relation-candidate-filter, i.e. the model classifies the relation for each entity pair in the cross product of *all* entity pairs (inefficient). + entity_threshold: Only pre-labelled entities above this threshold are taken into account by the model. + cross_augmentation: If `True`, use cross augmentation to transform `Sentence`s into `EncodedSentence`s. When cross augmentation is enabled, the transformation functions, e.g. `transform_corpus`, generate an encoded sentence for each entity pair in the cross product of all entities in the original sentence. When disabling cross augmentation, the transform functions only generate encoded sentences for each gold relation annotation in the original sentence. + encoding_strategy: An instance of a class conforming to the :class:`EncodingStrategy` protocol + zero_tag_value: The label to use for out-of-class relations + allow_unk_tag: If `False`, removes `<unk>` from the passed label dictionary, otherwise do nothing. + classifierargs: The remaining parameters passed to the underlying :class:`flair.models.DefaultClassifier` """ # Set label type and prepare label dictionary self._label_type = label_type @@ -341,10 +329,13 @@ def __init__( self.to(flair.device) def _valid_entities(self, sentence: Sentence) -> Iterator[_Entity]: - """Yields all valid entities, filtered under the specification of `self.entity_label_types`. + """Yields all valid entities, filtered under the specification of :attr:`~entity_label_types`.
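A sketch of how the entity filters above work together, assuming a corpus with 'ner' annotations and a 'relation' label type (`corpus` is a placeholder):

from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import RelationClassifier

# consider only ORG -> PER candidates, e.g. for a founded_by relation
classifier = RelationClassifier(
    embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased"),
    label_dictionary=corpus.make_label_dictionary(label_type="relation"),
    label_type="relation",
    entity_label_types={"ner": {"PER", "ORG"}},
    entity_pair_labels={("ORG", "PER")},
)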
+ + Args: + sentence: A Sentence object with entity annotations - :param sentence: A flair `Sentence` object with entity annotations - :return: Valid entities as `_Entity` + Yields: + Valid entities as `_Entity` """ for label_type, valid_labels in self.entity_label_types.items(): for entity_span in sentence.get_spans(label_type=label_type): @@ -369,11 +360,14 @@ def _entity_pair_permutations( If the passed sentence contains relation annotations, the relation gold label will be yielded along with the participating entities. The permutations are constructed by a filtered cross-product - under the specification of `self.entity_label_types` and `self.entity_pair_labels`. + under the specification of :attr:`~flair.models.RelationClassifier.entity_label_types` + and :attr:`~flair.models.RelationClassifier.entity_pair_labels`. - :param sentence: A flair `Sentence` object with entity annotations - :yields: Tuples of (HEAD, TAIL, gold_label). - The head and tail `_Entity`s have span references to the passed sentence. + Args: + sentence: A Sentence with entity annotations + + Yields: + Tuples of (HEAD, TAIL, gold_label): The head and tail `_Entity`s have span references to the passed sentence. """ valid_entities: List[_Entity] = list(self._valid_entities(sentence)) @@ -409,14 +403,16 @@ def _encode_sentence( tail: _Entity, gold_label: Optional[str] = None, ) -> EncodedSentence: - """Returns a new `Sentence` object with masked/marked head and tail spans according to the encoding strategy. + """Returns a new Sentence object with masked/marked head and tail spans according to the encoding strategy. + + If provided, the encoded sentence also has the corresponding gold label annotation from :attr:`~label_type`. - If provided, the encoded sentence also has the corresponding gold label annotation from `self.label_type`. + Args: + head: The head Entity + tail: The tail Entity + gold_label: An optional gold label of the induced relation by the head and tail entity - :param head: The head `_Entity` - :param tail: The tail `_Entity` - :param gold_label: An optional gold label of the induced relation by the head and tail entity - :return: The `EncodedSentence` (with gold annotations) + Returns: The EncodedSentence with gold annotations """ # Some sanity checks original_sentence: Sentence = head.span.sentence @@ -471,9 +467,10 @@ def _encode_sentence_for_inference( **exactly** one induced relation annotation, the gold annotation or `self.zero_tag_value`. - The created relations have head and tail spans from the original passed sentence. - :param sentence: A flair `Sentence` object with entity annotations - :return: Encoded sentences annotated with their gold relation and - the corresponding relation in the original sentence + Args: + sentence: A flair `Sentence` object with entity annotations + + Returns: Encoded sentences annotated with their gold relation and the corresponding relation in the original sentence """ for head, tail, gold_label in self._entity_pair_permutations(sentence): masked_sentence: EncodedSentence = self._encode_sentence( @@ -511,11 +508,14 @@ def transform_sentence(self, sentences: Union[Sentence, List[Sentence]]) -> List """Transforms sentences into encoded sentences specific to the `RelationClassifier`. For more information on the internal sentence transformation procedure, - see the :class:`RelationClassifier` architecture and - the different :class:`EncodingStrategy` variants docstrings.
+ see the :class:`flair.models.RelationClassifier` architecture and + the different :class:`flair.models.relation_classifier_model.EncodingStrategy` variants docstrings. + + Args: + sentences: sentences to transform - :param sentences: A (list) of sentence(s) to transform - :return: A list of encoded sentences specific to the `RelationClassifier` + Returns: + A list of encoded sentences specific to the `RelationClassifier` """ if not isinstance(sentences, list): sentences = [sentences] @@ -534,8 +534,10 @@ def transform_dataset(self, dataset: Dataset[Sentence]) -> FlairDatapointDataset see the :class:`RelationClassifier` architecture and the different :class:`EncodingStrategy` variants docstrings. - :param dataset: A dataset of sentences to transform - :return: A dataset of encoded sentences specific to the `RelationClassifier` + Args: + dataset: A dataset of sentences to transform + + Returns: A dataset of encoded sentences specific to the `RelationClassifier` """ data_loader: DataLoader = DataLoader(dataset, batch_size=1) original_sentences: List[Sentence] = [batch[0] for batch in iter(data_loader)] @@ -549,8 +551,10 @@ def transform_corpus(self, corpus: Corpus[Sentence]) -> Corpus[EncodedSentence]: see the :class:`RelationClassifier` architecture and the different :class:`EncodingStrategy` variants docstrings. - :param corpus: A corpus of sentences to transform - :return: A corpus of encoded sentences specific to the `RelationClassifier` + Args: + corpus: A corpus of sentences to transform + + Returns: A corpus of encoded sentences specific to the `RelationClassifier` """ return Corpus( train=self.transform_dataset(corpus.train) if corpus.train is not None else None, @@ -605,16 +609,16 @@ def predict( Standard `Sentence` objects and `EncodedSentences` specific to the `RelationClassifier` are allowed as input. The (relation) labels are directly added to the sentences. - :param sentences: A list of (encoded) sentences. - :param mini_batch_size: The mini batch size to use - :param return_probabilities_for_all_classes: Return probabilities for all classes instead of only best predicted - :param verbose: Set to display a progress bar - :param return_loss: Set to return loss - :param label_name: Set to change the predicted label type name - :param embedding_storage_mode: The default is 'none', which is always best. - Only set to 'cpu' or 'gpu' if you wish to predict - and keep the generated embeddings in CPU or GPU memory, respectively. - :return: The loss and the total number of classes, if `return_loss` is set + Args: + sentences: A list of (encoded) sentences. + mini_batch_size: The mini batch size to use + return_probabilities_for_all_classes: Return probabilities for all classes instead of only best predicted + verbose: Set to display a progress bar + return_loss: Set to return loss + label_name: Set to change the predicted label type name + embedding_storage_mode: The default is 'none', which is always best. Only set to 'cpu' or 'gpu' if you wish to predict and keep the generated embeddings in CPU or GPU memory, respectively. 
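The transformation step above is typically applied once before training; a hedged sketch of that workflow (`classifier` and `corpus` as in the previous sketch, output path illustrative):

from flair.trainers import ModelTrainer

# the RelationClassifier trains on EncodedSentences, not on raw sentences
encoded_corpus = classifier.transform_corpus(corpus)
trainer = ModelTrainer(classifier, encoded_corpus)
trainer.fine_tune("resources/relation_classifier")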
+ + Returns: The loss and the total number of classes, if `return_loss` is set """ prediction_label_type: str = self.label_type if label_name is None else label_name diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index 83e063b72..795e8a517 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -25,12 +25,15 @@ def __init__( ) -> None: """Initializes a RelationClassifier. - :param document_embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param beta: Parameter for F-beta score for evaluation and training annealing - :param train_on_gold_pairs_only: Set true to not train to predict no relation. - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + embeddings: embeddings used to embed each data point + label_type: name of the label + entity_label_type: name of the labels used to represent entities + entity_pair_filters: if provided, only classify pairs that match the filter + pooling_operation: either "first" or "first_last", specifying how the embeddings of the entities + should be used to create relation embeddings + train_on_gold_pairs_only: if True, relations with "O" (no relation) label will be ignored in training. + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ # pooling operation to get embeddings for entites self.pooling_operation = pooling_operation diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 32d0ec11b..c6defd24a 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -48,27 +48,27 @@ def __init__( """Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes. In case of multitask learning, pass shared embeddings or shared rnn into respective attributes. - :param embeddings: Embeddings to use during training and prediction - :param tag_dictionary: Dictionary containing all tags from corpus which can be predicted - :param tag_type: type of tag which is going to be predicted in case a corpus has multiple annotations - :param use_rnn: If true, use a RNN, else Linear layer. - :param rnn: (Optional) Takes a torch.nn.Module as parameter by which you can pass a shared RNN between - different tasks. - :param rnn_type: Specifies the RNN type to use, default is 'LSTM', can choose between 'GRU' and 'RNN' as well. - :param hidden_size: Hidden size of RNN layer - :param rnn_layers: number of RNN layers - :param bidirectional: If True, RNN becomes bidirectional - :param use_crf: If True, use a Conditional Random Field for prediction, else linear map to tag space. - :param reproject_embeddings: If True, add a linear layer on top of embeddings, if you want to imitate - fine tune non-trainable embeddings. - :param dropout: If > 0, then use dropout. - :param word_dropout: If > 0, then use word dropout. - :param locked_dropout: If > 0, then use locked dropout.
- :param train_initial_hidden_state: if True, trains initial hidden state of RNN - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) - :param init_from_state_dict: Indicator whether we are loading a model from state dict - since we need to transform previous models' weights into CRF instance weights + + Args: + embeddings: Embeddings to use during training and prediction + tag_dictionary: Dictionary containing all tags from corpus which can be predicted + tag_type: type of tag which is going to be predicted in case a corpus has multiple annotations + use_rnn: If true, use a RNN, else Linear layer. + rnn: Takes a torch.nn.Module as parameter by which you can pass a shared RNN between different tasks. + rnn_type: Specifies the RNN type to use, default is 'LSTM', can choose between 'GRU' and 'RNN' as well. + hidden_size: Hidden size of RNN layer + rnn_layers: number of RNN layers + bidirectional: If True, RNN becomes bidirectional + use_crf: If True, use a Conditional Random Field for prediction, else linear map to tag space. + reproject_embeddings: If True, add a linear layer on top of embeddings, if you want to imitate fine-tuning non-trainable embeddings. + dropout: If > 0, then use dropout. + word_dropout: If > 0, then use word dropout. + locked_dropout: If > 0, then use locked dropout. + train_initial_hidden_state: if True, trains initial hidden state of RNN + loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0. + init_from_state_dict: Indicator whether we are loading a model from state dict since we need to transform previous models' weights into CRF instance weights + allow_unk_predictions: If True, allows spans to predict <unk> too. + tag_format: the format to encode spans as tags, either "BIO" or "BIOES" """ super().__init__() @@ -208,7 +208,8 @@ def label_type(self): def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: """Initializes the loss weights based on given dictionary. - :param loss_weights: dictionary - contains loss weights + Args: + loss_weights: dictionary - contains loss weights """ n_classes = len(self.label_dictionary) weight_list = [1.0 for _ in range(n_classes)] @@ -221,7 +222,8 @@ def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: def _init_initial_hidden_state(self, num_directions: int): """Initializes hidden states given the number of directions in RNN. - :param num_directions: Number of directions in RNN. + Args: + num_directions: Number of directions in RNN. """ hs_initializer = torch.nn.init.xavier_normal_ lstm_init_h = torch.nn.Parameter( @@ -245,11 +247,12 @@ def RNN( ) -> torch.nn.RNN: """Static wrapper function returning an RNN instance from PyTorch.
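A minimal construction sketch matching the `SequenceTagger` arguments documented above (the tag dictionary is assumed to come from a corpus):

from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger

tagger = SequenceTagger(
    embeddings=WordEmbeddings("glove"),
    tag_dictionary=tag_dictionary,  # assumed: corpus.make_label_dictionary("ner")
    tag_type="ner",
    hidden_size=256,
    use_rnn=True,  # BiLSTM encoder on top of the embeddings
    use_crf=True,  # CRF output layer instead of a plain softmax
)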
- :param rnn_type: Type of RNN from torch.nn - :param rnn_layers: number of layers to include - :param hidden_size: hidden size of RNN cell - :param bidirectional: If True, RNN cell is bidirectional - :param rnn_input_dim: Input dimension to RNN cell + Args: + rnn_type: Type of RNN from torch.nn + rnn_layers: number of layers to include + hidden_size: hidden size of RNN cell + bidirectional: If True, RNN cell is bidirectional + rnn_input_dim: Input dimension to RNN cell """ if rnn_type in ["LSTM", "GRU", "RNN"]: RNN = getattr(torch.nn, rnn_type)( @@ -291,8 +294,9 @@ def _prepare_tensors(self, data_points: Union[List[Sentence], Sentence]) -> Tupl def forward(self, sentence_tensor: torch.Tensor, lengths: torch.LongTensor): """Forward propagation through network. - :param sentence_tensor: A tensor representing the batch of sentences. - :param lengths: A IntTensor representing the lengths of the respective sentences. + Args: + sentence_tensor: A tensor representing the batch of sentences. + lengths: A IntTensor representing the lengths of the respective sentences. """ if self.use_dropout: sentence_tensor = self.dropout(sentence_tensor) @@ -367,8 +371,10 @@ def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): Trims current batch tensor in shape (batch size, sequence length, tagset size) in such a way that all pads are going to be removed. - :param features: torch.tensor containing all features from forward propagation - :param lengths: length from each sentence in batch in order to trim padding tokens + + Args: + features: all features from forward propagation + lengths: length from each sentence in batch in order to trim padding tokens """ features_formatted = [] for feat, length in zip(features, lengths): @@ -380,7 +386,8 @@ def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): def _get_gold_labels(self, sentences: List[Sentence]) -> List[str]: """Extracts gold labels from each sentence. - :param sentences: List of sentences in batch + Args: + sentences: List of sentences in batch """ # spans need to be encoded as token-level predictions if self.predict_spans: @@ -432,13 +439,15 @@ def predict( ): """Predicts labels for current batch with CRF or Softmax. - :param sentences: List of sentences in batch - :param mini_batch_size: batch size for test data - :param return_probabilities_for_all_classes: Whether to return probabilities for all classes - :param verbose: whether to use progress bar - :param label_name: which label to predict - :param return_loss: whether to return loss value - :param embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None. + Args: + sentences: List of sentences in batch + mini_batch_size: batch size for test data + return_probabilities_for_all_classes: Whether to return probabilities for all classes + verbose: whether to use progress bar + label_name: which label to predict + return_loss: whether to return loss value + embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None. + force_token_predictions: add labels per token instead of span labels, even if `self.predict_spans` is True """ if label_name is None: label_name = self.tag_type @@ -534,9 +543,10 @@ def predict( def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], probabilities_for_all_classes: bool): """Softmax over emission scores from forward propagation. 
- :param features: sentence tensor from forward propagation - :param batch: list of sentence - :param probabilities_for_all_classes: whether to return score for each tag in tag dictionary + Args: + features: sentence tensor from forward propagation + batch: sentences + probabilities_for_all_classes: whether to return score for each tag in tag dictionary """ softmax_batch = F.softmax(features, dim=1).cpu() scores_batch, prediction_batch = torch.max(softmax_batch, dim=1) @@ -562,10 +572,7 @@ def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], pro return predictions, all_tags def _all_scores_for_token(self, sentences: List[Sentence], scores: torch.Tensor, lengths: List[int]): - """Returns all scores for each tag in tag dictionary. - - :param scores: Scores for current sentence. - """ + """Returns all scores for each tag in tag dictionary.""" scores = scores.numpy() tokens = [token for sentence in sentences for token in sentence] prob_all_tags = [ @@ -918,11 +925,13 @@ def push_to_hub( ): """Uploads the Sequence Tagger model to a Hugging Face Hub repository. - :param repo_id: A namespace (user or an organization) and a repo name separated by a `/`. - :param token: An authentication token (See https://huggingface.co/settings/token). - :param private: Whether the repository is private. - :param commit_message: Message to commit while pushing. - :return: The url of the repository. + Args: + repo_id: A namespace (user or an organization) and a repo name separated by a `/`. + token: An authentication token (See https://huggingface.co/settings/token). + private: Whether the repository is private. + commit_message: Message to commit while pushing. + + Returns: The url of the repository. """ # Lazy import from huggingface_hub import create_repo, model_info, upload_folder diff --git a/flair/models/sequence_tagger_utils/crf.py b/flair/models/sequence_tagger_utils/crf.py index 0b4e78e6c..085339dce 100644 --- a/flair/models/sequence_tagger_utils/crf.py +++ b/flair/models/sequence_tagger_utils/crf.py @@ -17,9 +17,10 @@ class CRF(torch.nn.Module): def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool) -> None: """Initialize the Conditional Random Field. - :param tag_dictionary: tag dictionary in order to find ID for start and stop tags - :param tagset_size: number of tag from tag dictionary - :param init_from_state_dict: whether we load pretrained model from state dict + Args: + tag_dictionary: tag dictionary in order to find ID for start and stop tags + tagset_size: number of tags from tag dictionary + init_from_state_dict: whether we load a pretrained model from state dict """ super().__init__() @@ -37,9 +38,10 @@ def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool) def forward(self, features: torch.Tensor) -> torch.Tensor: """Forward propagation of Conditional Random Field.
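The shape arithmetic of the CRF forward pass above can be reproduced with plain broadcasting; the indexing convention (previous tag on the third axis) is assumed here:

import torch

batch_size, seq_len, tagset_size = 2, 5, 4
emissions = torch.randn(batch_size, seq_len, tagset_size)
transitions = torch.randn(tagset_size, tagset_size)

# scores[b, t, i, j] = emission of tag j at position t + transition score i -> j
crf_scores = emissions.unsqueeze(2) + transitions.unsqueeze(0).unsqueeze(0)
print(crf_scores.shape)  # torch.Size([2, 5, 4, 4])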
- :param features: output from RNN / Linear layer in shape (batch size, seq len, hidden size) - :return: CRF scores (emission scores for each token + transitions prob from previous state) in - shape (batch_size, seq len, tagset size, tagset size) + Args: + features: output from RNN / Linear layer in shape (batch size, seq len, hidden size) + + Returns: CRF scores (emission scores for each token + transitions prob from previous state) in shape (batch_size, seq len, tagset size, tagset size) """ batch_size, seq_len = features.size()[:2] diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 1cae3c008..73c10fb67 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -19,7 +19,8 @@ class ViterbiLoss(torch.nn.Module): def __init__(self, tag_dictionary: Dictionary) -> None: """Create an instance of the Viterbi loss. - :param tag_dictionary: tag_dictionary of task + Args: + tag_dictionary: tag_dictionary of task """ super().__init__() self.tag_dictionary = tag_dictionary @@ -30,10 +31,11 @@ def __init__(self, tag_dictionary: Dictionary) -> None: def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: """Forward propagation of Viterbi Loss. - :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), - lengths of sentences in batch, transitions from CRF - :param targets: true tags for sentences which will be converted to matrix indices. - :return: summed Viterbi Loss over all data points + Args: + features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), lengths of sentences in batch, transitions from CRF + targets: true tags for sentences which will be converted to matrix indices. + + Returns: summed Viterbi Loss over all data points """ features, lengths, transitions = features_tuple @@ -82,9 +84,11 @@ def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: def _log_sum_exp(tensor, dim): """Calculates the log-sum-exponent of a tensor's dimension in a numerically stable way. - :param tensor: tensor - :param dim: dimension to calculate log-sum-exp of - :return: log-sum-exp + Args: + tensor: tensor + dim: dimension to calculate log-sum-exp of + + Returns: log-sum-exp """ m, _ = torch.max(tensor, dim) m_expanded = m.unsqueeze(dim).expand_as(tensor) @@ -99,8 +103,9 @@ def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor): from previous tag 5 and could directly be addressed through the 1-dim indices (10 + tagset_size * 5) = 70, if our tagset consists of 12 tags. - :param targets: targets as in tag dictionary - :param lengths: lengths of sentences in batch + Args: + targets: targets as in tag dictionary + lengths: lengths of sentences in batch """ targets_per_sentence = [] @@ -114,7 +119,7 @@ def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor): matrix_indices = [ [self.tag_dictionary.get_idx_for_item(START_TAG) + (s[0] * self.tagset_size)] - + [s[i] + (s[i + 1] * self.tagset_size) for i in range(0, len(s) - 1)] + + [s[i] + (s[i + 1] * self.tagset_size) for i in range(len(s) - 1)] for s in targets_per_sentence ] @@ -127,7 +132,8 @@ class ViterbiDecoder: def __init__(self, tag_dictionary: Dictionary) -> None: """Initialize the Viterbi Decoder. 
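The `_log_sum_exp` helper above uses the standard max-shift trick; a standalone sketch, checked against `torch.logsumexp`:

import torch

def log_sum_exp(tensor: torch.Tensor, dim: int) -> torch.Tensor:
    # subtract the per-slice maximum before exponentiating to avoid overflow
    m, _ = torch.max(tensor, dim)
    m_expanded = m.unsqueeze(dim).expand_as(tensor)
    return m + torch.log(torch.sum(torch.exp(tensor - m_expanded), dim))

x = torch.randn(3, 7)
assert torch.allclose(log_sum_exp(x, dim=1), torch.logsumexp(x, dim=1), atol=1e-6)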
- :param tag_dictionary: Dictionary of tags for sequence labeling task + Args: + tag_dictionary: Dictionary of tags for sequence labeling task """ self.tag_dictionary = tag_dictionary self.tagset_size = len(tag_dictionary) @@ -139,10 +145,12 @@ def decode( ) -> Tuple[List, List]: """Decoding function returning the most likely sequence of tags. - :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), - lengths of sentence in batch, transitions of CRF - :param probabilities_for_all_classes: whether to return probabilities for all tags - :return: decoded sequences + Args: + features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), lengths of sentence in batch, transitions of CRF + probabilities_for_all_classes: whether to return probabilities for all tags + sentences: list of the respective sentences with extracted features. + + Returns: decoded sequences """ features, lengths, transitions = features_tuple all_tags = [] @@ -218,10 +226,7 @@ def decode( return tags, all_tags def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]): - """Returns all scores for each tag in tag dictionary. - - :param scores: Scores for current sentence. - """ + """Returns all scores for each tag in tag dictionary.""" scores = scores.numpy() prob_tags_per_sentence = [] for scores_sentence, length, sentence in zip(scores, lengths, sentences): diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index 6bee5aee1..6a9287147 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -183,11 +183,13 @@ def add_and_switch_to_new_task( Sets necessary attributes and finally 'switches' to the new task. Parameters are similar to the constructor except for model choice, batch size and negative sampling. This method does not store the resultant model onto disk. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of the labels you want to predict - :param label_type: string to identify the label type ('ner', 'sentiment', etc.) - :param multi_label: whether this task is a multi-label prediction problem - :param force_switch: if True, will overwrite existing task with same name + + Args: + task_name: a string depicting the name of the task + label_dictionary: dictionary of the labels you want to predict + label_type: string to identify the label type ('ner', 'sentiment', etc.) + multi_label: whether this task is a multi-label prediction problem + force_switch: if True, will overwrite existing task with same name """ if task_name in self._task_specific_attributes and not force_switch: log.warning(f"Task `{task_name}` already exists in TARS model. Switching to it.") @@ -262,9 +264,10 @@ def predict_zero_shot( ): """Make zero shot predictions from the TARS model. - :param sentences: input sentence objects to classify - :param candidate_label_set: set of candidate labels - :param multi_label: indicates whether multi-label or single class prediction. Defaults to True. + Args: + sentences: input sentence objects to classify + candidate_label_set: set of candidate labels + multi_label: indicates whether multi-label or single class prediction. Defaults to True. 
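A usage sketch for `predict_zero_shot` as documented above, using the published 'tars-base' model:

from flair.data import Sentence
from flair.models import TARSClassifier

tars = TARSClassifier.load("tars-base")
sentence = Sentence("I am so glad you liked it!")

# the model has never been trained on these labels; TARS matches them zero-shot
tars.predict_zero_shot(sentence, ["happy", "sad"], multi_label=False)
print(sentence.labels)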
""" # check if candidate_label_set is empty if candidate_label_set is None or len(candidate_label_set) == 0: @@ -331,20 +334,23 @@ def __init__( label_dictionary: Optional[Dictionary] = None, label_type: Optional[str] = None, embeddings: Union[TransformerWordEmbeddings, str] = "bert-base-uncased", - num_negative_labels_to_sample: int = 2, + num_negative_labels_to_sample: Optional[int] = 2, prefix: bool = True, **tagger_args, ) -> None: """Initializes a TarsTagger. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of labels you want to predict - :param embeddings: name of the pre-trained transformer model e.g., - 'bert-base-uncased' etc - :param num_negative_labels_to_sample: number of negative labels to sample for each - positive labels against a sentence during training. Defaults to 2 negative - labels for each positive label. The model would sample all the negative labels - if None is passed. That slows down the training considerably. + Args: + task_name: a string depicting the name of the task + label_dictionary: dictionary of labels you want to predict + label_type: label_type: name of the label + embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased' + num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a + sentence during training. Defaults to 2 negative labels for each positive label. The model would sample + all the negative labels if None is passed. That slows down the training considerably. + prefix: if True, the label will be concatenated at the start, else on the end. + **tagger_args: The arguments propagated to :meth:`FewshotClassifier.__init__` + """ super().__init__() @@ -664,24 +670,27 @@ def __init__( label_dictionary: Optional[Dictionary] = None, label_type: Optional[str] = None, embeddings: Union[TransformerDocumentEmbeddings, str] = "bert-base-uncased", - num_negative_labels_to_sample: int = 2, + num_negative_labels_to_sample: Optional[int] = 2, prefix: bool = True, **tagger_args, ) -> None: """Initializes a TarsClassifier. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of labels you want to predict - :param embeddings: name of the pre-trained transformer model e.g., - 'bert-base-uncased' etc - :param num_negative_labels_to_sample: number of negative labels to sample for each - positive labels against a sentence during training. Defaults to 2 negative - labels for each positive label. The model would sample all the negative labels - if None is passed. That slows down the training considerably. - :param multi_label: auto-detected by default, but you can set this to True - to force multi-label predictionor False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param beta: Parameter for F-beta score for evaluation and training annealing + Args: + task_name: a string depicting the name of the task. + label_dictionary: dictionary of labels you want to predict. + label_type: label_type: name of the label + embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased'. + num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a + sentence during training. Defaults to 2 negative labels for each positive label. + The model would sample all the negative labels if None is passed. + That slows down the training considerably. 
+ multi_label: auto-detected by default, but you can set this to True to force multi-label predictions + or False to force single-label predictions. + multi_label_threshold: If multi-label, you can set the threshold to make predictions. + beta: Parameter for F-beta score for evaluation and training annealing. + prefix: if True, the label will be concatenated at the start, otherwise at the end. + **tagger_args: The arguments propagated to :meth:`FewshotClassifier.__init__` """ super().__init__() diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 25a6b1d11..1b330a0da 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -29,14 +29,17 @@ def __init__( ) -> None: """Initializes a TextClassifier. - :param embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction - or False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + embeddings: embeddings used to embed each data point + label_dictionary: dictionary of labels you want to predict + label_type: string identifier for tag type + multi_label: auto-detected by default, but you can set this to True to force multi-label predictions + or False to force single-label predictions. + multi_label_threshold: If multi-label, you can set the threshold to make predictions + beta: Parameter for F-beta score for evaluation and training annealing + loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is + unspecified it will default to 1.0 + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index 32f58e17b..2d32a54b0 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Union import torch +from deprecated.sphinx import deprecated import flair.nn from flair.data import Dictionary, Sentence, Span, Token @@ -11,15 +12,6 @@ log = logging.getLogger("flair") -def WordTagger(embeddings, tag_dictionary, tag_type, **classifierargs): - from warnings import warn - - warn("The WordTagger class is deprecated and will be removed in Flair 1.0. Use TokenClassifier instead!") - return TokenClassifier( - embeddings=embeddings, label_dictionary=tag_dictionary, label_type=tag_type, **classifierargs - ) - - class TokenClassifier(flair.nn.DefaultClassifier[Sentence, Token]): """This is a simple class of models that tags individual words in text.""" @@ -33,9 +25,12 @@ def __init__( ) -> None: """Initializes a TokenClassifier.
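As a usage sketch for the `TextClassifier` arguments documented above (it assumes the TREC_6 corpus with its `question_class` label type, as used in the Flair tutorials; any `TransformerDocumentEmbeddings` model id works):

```python
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier

# corpus and label dictionary for a single-label classification task
corpus = TREC_6()
label_dict = corpus.make_label_dictionary(label_type="question_class")

classifier = TextClassifier(
    embeddings=TransformerDocumentEmbeddings("distilbert-base-uncased", fine_tune=True),
    label_dictionary=label_dict,
    label_type="question_class",
)
```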
- :param embeddings: word embeddings used in tagger - :param tag_dictionary: dictionary of tags you want to predict - :param tag_type: string identifier for tag type + Args: + embeddings: word embeddings used in tagger + label_dictionary: dictionary of labels or BIO/BIOES tags you want to predict + label_type: string identifier for tag type + span_encoding: the format to encode spans as tags, either "BIO" or "BIOES" + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ # if the classifier predicts BIO/BIOES span labels, the internal label dictionary must be computed if label_dictionary.span_labels: @@ -231,3 +226,8 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "TokenClassifier" from typing import cast return cast("TokenClassifier", super().load(model_path=model_path)) + + +@deprecated(reason="The WordTagger was renamed to :class:`flair.models.TokenClassifier`.", version="0.12.2") +class WordTagger(TokenClassifier): + pass diff --git a/flair/nn/distance/cosine.py b/flair/nn/distance/cosine.py index a19cc8452..d92188ab1 100644 --- a/flair/nn/distance/cosine.py +++ b/flair/nn/distance/cosine.py @@ -6,8 +6,12 @@ def dot_product(a: torch.Tensor, b: torch.Tensor, normalize=False): """Computes dot product for pairs of vectors. - :param normalize: Vectors are normalized (leads to cosine similarity) - :return: Matrix with res[i][j] = dot_product(a[i], b[j]) + Args: + a: the left tensor + b: the right tensor + normalize: if True, vectors are normalized before the dot product (yielding cosine similarity) + + Returns: Matrix with res[i][j] = dot_product(a[i], b[j]) """ if len(a.shape) == 1: a = a.unsqueeze(0) diff --git a/flair/nn/model.py b/flair/nn/model.py index 2e77d67d3..dd58c0456 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -62,10 +62,22 @@ def evaluate( """Evaluates the model. Returns a Result object containing evaluation results and a loss value. Implement this to enable evaluation. - :param data_loader: DataLoader that iterates over dataset to be evaluated - :param out_path: Optional output path to store predictions - :param embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 'none' means all embeddings are deleted and freshly recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU # noqa: E501 - :return: Returns a Tuple consisting of a Result object and a loss float value + + Args: + data_points: The labeled data_points to evaluate. + gold_label_type: The label type indicating the gold labels + out_path: Optional output path to store predictions + embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 'none' means all embeddings are deleted and freshly + recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU + mini_batch_size: The batch_size to use for predictions + main_evaluation_metric: Specify which metric to highlight as main_score + exclude_labels: Specify classes that won't be considered in evaluation + gold_label_dictionary: Specify which classes should be considered; all other classes will be taken as <unk>. + return_loss: Whether to additionally compute the loss on the data points. + **kwargs: Arguments that will be ignored. + + Returns: + The evaluation results. """ raise NotImplementedError @@ -100,7 +112,9 @@ def _fetch_model(model_name) -> str: def save(self, model_file: Union[str, Path], checkpoint: bool = False): """Saves the current model to the provided file.
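The `dot_product` helper documented above is easy to sanity-check by hand; with `normalize=True` it yields cosine similarities. A quick sketch (it assumes the function is importable from the module path `flair.nn.distance.cosine` shown in this diff):

```python
import torch
from flair.nn.distance.cosine import dot_product

a = torch.tensor([[1.0, 0.0], [0.0, 2.0]])
b = torch.tensor([[1.0, 1.0]])

print(dot_product(a, b))                  # raw dot products: [[1.], [2.]]
print(dot_product(a, b, normalize=True))  # cosine similarities: both ~0.7071
```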
- :param model_file: the model file + Args: + model_file: the model file + checkpoint: currently unused. """ model_state = self._get_state_dict() @@ -115,8 +129,10 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": """Loads the model from the given file. - :param model_path: the model file or the already loaded state dict - :return: the loaded text classifier model + Args: + model_path: the model file or the already loaded state dict + + Returns: the loaded model """ # if this class is abstract, go through all inheriting classes and try to fetch and load the model if inspect.isabstract(cls): @@ -498,13 +514,15 @@ def predict( """Predicts the class labels for the given sentences. The labels are directly added to the sentences. - :param sentences: list of sentences - :param mini_batch_size: mini batch size to use - :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted # noqa: E501 - :param verbose: set to True to display a progress bar - :param return_loss: set to True to return loss - :param label_name: set this to change the name of the label type that is predicted # noqa: E501 - :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. 'gpu' to store embeddings in GPU memory. # noqa: E501 + + Args: + sentences: list of sentences + mini_batch_size: mini batch size to use + return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted + verbose: set to True to display a progress bar + return_loss: set to True to return loss + label_name: set this to change the name of the label type that is predicted + embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. """ raise NotImplementedError @@ -767,14 +785,15 @@ def predict( ): """Predicts the class labels for the given sentences. The labels are directly added to the sentences. - :param sentences: list of sentences - :param mini_batch_size: mini batch size to use - :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted # noqa: E501 - :param verbose: set to True to display a progress bar - :param return_loss: set to True to return loss - :param label_name: set this to change the name of the label type that is predicted - :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. # noqa: E501 - 'gpu' to store embeddings in GPU memory. + Args: + sentences: list of sentences to predict + mini_batch_size: the number of sentences that will be predicted within one batch + return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted + verbose: set to True to display a progress bar + return_loss: set to True to return loss + label_name: set this to change the name of the label type that is predicted + embedding_storage_mode: default is 'none' which is the best in most cases.
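The `save`/`load` pair documented above round-trips any Flair model through a single file. A short sketch (the local file name is a placeholder):

```python
from flair.models import SequenceTagger

# load a pretrained tagger, save it locally, then reload it from disk
tagger = SequenceTagger.load("flair/ner-english-fast")
tagger.save("my-tagger.pt")
tagger = SequenceTagger.load("my-tagger.pt")
```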
+ Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. """ if label_name is None: label_name = self.label_type if self.label_type is not None else "label" diff --git a/flair/samplers.py b/flair/samplers.py index e20646624..135dfb331 100644 --- a/flair/samplers.py +++ b/flair/samplers.py @@ -13,7 +13,8 @@ class FlairSampler(Sampler): def set_dataset(self, data_source): """Initialize the data source for the FlairSampler. - :param data_source: dataset to sample from. + Args: + data_source: dataset to sample from. """ self.data_source = data_source self.num_samples = len(self.data_source) @@ -29,10 +30,7 @@ def __init__(self) -> None: super().__init__(None) def set_dataset(self, data_source): - """Initialize the dataset used for sampling. - - :param data_source: - """ + """Initialize the dataset used for sampling.""" self.data_source = data_source self.num_samples = len(self.data_source) self.indices = list(range(len(data_source))) @@ -91,7 +89,8 @@ class ExpandingChunkSampler(FlairSampler): def __init__(self, step=3) -> None: """Initialize the ExpandingChunkSampler. - :param step: every *step* epochs the block size increments by one. + Args: + step: every *step* epochs the block size increments by one. """ super().__init__(None) self.block_size = 1 diff --git a/flair/splitter.py b/flair/splitter.py index cf57b36bd..90464bfc2 100644 --- a/flair/splitter.py +++ b/flair/splitter.py @@ -97,8 +97,9 @@ class SpacySentenceSplitter(SentenceSplitter): Implementation of :class:`SentenceSplitter`, using models from Spacy. - :param model Spacy V2 model or the name of the model to load. - :param tokenizer Custom tokenizer to use (default :class:`SpacyTokenizer`) + Args: + model: Spacy V2 model or the name of the model to load. + tokenizer: Custom tokenizer to use (default :class:`SpacyTokenizer`) """ def __init__(self, model: Union[Any, str], tokenizer: Optional[Tokenizer] = None) -> None: diff --git a/flair/tokenization.py b/flair/tokenization.py index ab4c0d239..af77e5f57 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -31,9 +31,8 @@ def name(self) -> str: class SpacyTokenizer(Tokenizer): """Tokenizer using spacy under the hood. - Implementation of :class:`Tokenizer`, using models from Spacy. - - :param model a Spacy V2 model or the name of the model to load. + Args: + model: a Spacy V2 model or the name of the model to load. """ def __init__(self, model) -> None: diff --git a/flair/trainers/plugins/base.py b/flair/trainers/plugins/base.py index 993273599..709866dad 100644 --- a/flair/trainers/plugins/base.py +++ b/flair/trainers/plugins/base.py @@ -38,7 +38,8 @@ class Pluggable: def __init__(self, *, plugins: Sequence[PluginArgument] = []) -> None: """Initialize a `Pluggable`. - :param plugins: Plugins which should be attached to this `Pluggable`. + Args: + plugins: Plugins which should be attached to this `Pluggable`. """ self._hook_handles: Dict[EventIdenifier, Dict[HookHandleId, HookHandle]] = defaultdict(dict) @@ -77,8 +78,9 @@ def validate_event(self, *events: EventIdenifier): def register_hook(self, func: Callable, *events: EventIdenifier): """Register a hook. - :param func: Function to be called when the event is emitted. - :param *events: List of events to call this function on. + Args: + func: Function to be called when the event is emitted. + *events: List of events to call this function on.
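`SpacySentenceSplitter` above requires a spaCy model; the same `SentenceSplitter` interface can be illustrated without extra dependencies using `SegtokSentenceSplitter`, assuming it is available from `flair.splitter` as in current releases:

```python
from flair.splitter import SegtokSentenceSplitter

# split raw text into a list of Sentence objects
splitter = SegtokSentenceSplitter()
sentences = splitter.split("I love Berlin. I love Paris.")
print([s.to_plain_string() for s in sentences])
```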
""" self.validate_event(*events) @@ -123,10 +125,11 @@ def __init__( ) -> None: """Intitialize `HookHandle`. - :param _id: Id, the callback is stored as in the `Pluggable`. - :param *events: List of events, the callback is registered for. - :param func: The callback function. - :param pluggable: The `Pluggable` where the callback is registered. + Args: + _id: Id, the callback is stored as in the `Pluggable`. + events: List of events, the callback is registered for. + func: The callback function. + pluggable: The `Pluggable` where the callback is registered. """ pluggable.validate_event(*events) @@ -194,7 +197,7 @@ def attach_to(self, pluggable: Pluggable): try: func = getattr(self, name) - # get attribute hook events (mayr aise an AttributeError) + # get attribute hook events (may raise an AttributeError) events = func._plugin_hook_events # register function as a hook diff --git a/flair/trainers/plugins/functional/amp.py b/flair/trainers/plugins/functional/amp.py index a0040b76a..411b7d372 100644 --- a/flair/trainers/plugins/functional/amp.py +++ b/flair/trainers/plugins/functional/amp.py @@ -39,11 +39,7 @@ def backward(self, loss): @TrainerPlugin.hook def after_setup(self, **kw): - """Wraps with AMP. - - :param kw: - :return: - """ + """Wraps with AMP.""" optimizer = self.trainer.optimizer self.trainer.model, self.trainer.optimizer = self.amp.initialize( diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index f33c60b77..69646ef5d 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -51,13 +51,7 @@ def after_setup( optimizer, **kw, ): - """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers. - - :param train_with_dev: - :param optimizer: - :param kw: - :return: - """ + """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers.""" # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" @@ -75,13 +69,7 @@ def after_setup( @TrainerPlugin.hook def after_evaluation(self, current_model_is_best, validation_scores, **kw): - """Scheduler step of AnnealOnPlateau. - - :param current_model_is_best: - :param validation_scores: - :param kw: - :return: - """ + """Scheduler step of AnnealOnPlateau.""" reduced_learning_rate: bool = self.scheduler.step(*validation_scores) self.store_learning_rate() diff --git a/flair/trainers/plugins/functional/checkpoints.py b/flair/trainers/plugins/functional/checkpoints.py index f3bfc3ff6..f1f7020f4 100644 --- a/flair/trainers/plugins/functional/checkpoints.py +++ b/flair/trainers/plugins/functional/checkpoints.py @@ -19,12 +19,7 @@ def __init__( @TrainerPlugin.hook def after_training_epoch(self, epoch, **kw): - """Saves the model each k epochs. 
- - :param epoch: - :param kw: - :return: - """ + """Saves the model every k epochs.""" if self.save_model_each_k_epochs > 0 and epoch % self.save_model_each_k_epochs == 0: log.info( f"Saving model at current epoch since 'save_model_each_k_epochs={self.save_model_each_k_epochs}' " diff --git a/flair/trainers/plugins/functional/linear_scheduler.py b/flair/trainers/plugins/functional/linear_scheduler.py index 08aca32c2..51b295c7d 100644 --- a/flair/trainers/plugins/functional/linear_scheduler.py +++ b/flair/trainers/plugins/functional/linear_scheduler.py @@ -31,14 +31,7 @@ def after_setup( max_epochs, **kw, ): - """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers. - - :param dataset_size: - :param mini_batch_size: - :param max_epochs: - :param kw: - :return: - """ + """Initialize the linear scheduler and calculate the number of warmup steps.""" # calculate warmup steps steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size num_train_steps = int(steps_per_epoch * max_epochs) @@ -52,21 +45,13 @@ @TrainerPlugin.hook def before_training_epoch(self, **kw): - """Load state for anneal_with_restarts, batch_growth_annealing, logic for early stopping. - - :param kw: - :return: - """ + """Store the learning rate at the beginning of each epoch.""" self.store_learning_rate() self.previous_learning_rate = self.current_learning_rate @TrainerPlugin.hook def after_training_batch(self, optimizer_was_run: bool, **kw): - """Do the scheduler step if one-cycle or linear decay. - - :param kw: - :return: - """ + """Do the scheduler step if one-cycle or linear decay.""" # skip if no optimization has happened. if not optimizer_was_run: return diff --git a/flair/trainers/plugins/functional/weight_extractor.py b/flair/trainers/plugins/functional/weight_extractor.py index 3edd5ff79..a6ed5eab2 100644 --- a/flair/trainers/plugins/functional/weight_extractor.py +++ b/flair/trainers/plugins/functional/weight_extractor.py @@ -11,14 +11,7 @@ def __init__(self, base_path) -> None: @TrainerPlugin.hook def after_training_batch(self, batch_no, epoch, total_number_of_batches, **kw): - """Extracts weights. - - :param batch_no: - :param epoch: - :param total_number_of_batches: - :param kw: - :return: - """ + """Extracts weights.""" modulo = max(1, int(total_number_of_batches / 10)) iteration = epoch * total_number_of_batches + batch_no diff --git a/flair/trainers/plugins/loggers/loss_file.py b/flair/trainers/plugins/loggers/loss_file.py index 9658aa6b0..f19ef918b 100644 --- a/flair/trainers/plugins/loggers/loss_file.py +++ b/flair/trainers/plugins/loggers/loss_file.py @@ -60,21 +60,12 @@ def __init__( @TrainerPlugin.hook def before_training_epoch(self, epoch, **kw): - """Get the current epoch for loss file logging. - - :param epoch: - :param kw: - :return: - """ + """Get the current epoch for loss file logging.""" self.current_row = {MetricName("epoch"): epoch} @TrainerPlugin.hook def metric_recorded(self, record): - """Add the metric of a record to the current row.
- - :param record: - :return: - """ + """Add the metric of a record to the current row.""" if record.name in self.headers and self.current_row is not None: if record.name == "learning_rate" and not record.is_scalar: # record is a list of scalars @@ -90,12 +81,7 @@ def metric_recorded(self, record): @TrainerPlugin.hook def after_evaluation(self, epoch, **kw): - """This prints all relevant metrics. - - :param epoch: - :param kw: - :return: - """ + """This prints all relevant metrics.""" if self.loss_txt is not None: self.current_row[MetricName("timestamp")] = f"{datetime.now():%H:%M:%S}" diff --git a/flair/trainers/plugins/loggers/metric_history.py b/flair/trainers/plugins/loggers/metric_history.py index fe9166fc1..46802824d 100644 --- a/flair/trainers/plugins/loggers/metric_history.py +++ b/flair/trainers/plugins/loggers/metric_history.py @@ -30,9 +30,5 @@ def metric_recorded(self, record): @TrainerPlugin.hook def after_training(self, **kw): - """Returns metric history. - - :param kw: - :return: - """ + """Returns metric history.""" self.trainer.return_values.update(self.metric_history) diff --git a/flair/trainers/plugins/loggers/tensorboard.py b/flair/trainers/plugins/loggers/tensorboard.py index 1700768dc..8fc8af9e9 100644 --- a/flair/trainers/plugins/loggers/tensorboard.py +++ b/flair/trainers/plugins/loggers/tensorboard.py @@ -13,8 +13,11 @@ class TensorboardLogger(TrainerPlugin): def __init__(self, log_dir=None, comment="", tracked_metrics=()) -> None: """Initializes the TensorboardLogger. - :param log_dir: Directory into which tensorboard log files will be written # noqa: E501 - :param tracked_metrics: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example # noqa: E501 + Args: + log_dir: Directory into which tensorboard log files will be written + comment: Comment to append as a suffix to the default + ``log_dir``. If ``log_dir`` is assigned, this argument has no effect. + tracked_metrics: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example """ super().__init__() self.comment = comment @@ -50,10 +53,6 @@ def metric_recorded(self, record): @TrainerPlugin.hook def _training_finally(self, **kw): - """Closes the writer. - - :param kw: - :return: - """ + """Closes the writer.""" assert self.writer is not None self.writer.close() diff --git a/flair/trainers/plugins/metric_records.py b/flair/trainers/plugins/metric_records.py index 6648f9f22..034c02185 100644 --- a/flair/trainers/plugins/metric_records.py +++ b/flair/trainers/plugins/metric_records.py @@ -76,11 +76,12 @@ def __init__( ) -> None: """Create a metric record. - :param name: Name of the metric. - :param typ: Type of metric. - :param value: Value of the metric (can be anything: scalar, tensor, - image, etc.). - :param walltime: Time of recording this metric. + Args: + name: Name of the metric. + typ: Type of metric. + value: Value of the metric (can be anything: scalar, tensor, image, etc.). + global_step: The time step of the log. This should be incremented the next time this metric is logged again. E.g. if you log every epoch, set the global_step to the current epoch. + walltime: Time of recording this metric.
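For the `TensorboardLogger` arguments above, a hedged usage sketch (it assumes `model` and `corpus` are defined elsewhere; the import path follows this diff's file layout):

```python
from flair.trainers import ModelTrainer
from flair.trainers.plugins.loggers.tensorboard import TensorboardLogger

trainer = ModelTrainer(model, corpus)  # model and corpus defined elsewhere
trainer.train(
    "resources/taggers/example",
    plugins=[TensorboardLogger(log_dir="runs", tracked_metrics=[("macro avg", "f1-score")])],
)
```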
""" self.name: MetricName = MetricName(name) self.typ: RecordType = typ diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 3cf96f559..7db6a7d17 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -16,6 +16,7 @@ import flair.nn from flair.data import Corpus, Dictionary, _len_dataset from flair.datasets import DataLoader +from flair.samplers import FlairSampler from flair.trainers.plugins import ( AnnealingPlugin, CheckpointPlugin, @@ -54,8 +55,9 @@ class ModelTrainer(Pluggable): def __init__(self, model: flair.nn.Model, corpus: Corpus) -> None: """Initialize a model trainer. - :param model: The model that you want to train. The model should inherit from flair.nn.Model # noqa: E501 - :param corpus: The dataset used to train the model, should be of type Corpus + Args: + model: The model that you want to train. The model should inherit from flair.nn.Model # noqa: E501 + corpus: The dataset used to train the model, should be of type Corpus """ super().__init__() self.model: flair.nn.Model = model @@ -307,7 +309,7 @@ def train_custom( gold_label_dictionary_for_eval: Optional[Dictionary] = None, exclude_labels: List[str] = [], # sampling and shuffling - sampler=None, + sampler: Optional[FlairSampler] = None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring @@ -331,24 +333,23 @@ def train_custom( Args: base_path: Main path to which all output during training is logged and models are saved - learning_rate (float): The learning rate of the optimizer - decoder_learning_rate (Optional[float]): Optional, if set, the decoder is trained with a separate learning rate - mini_batch_size (int): Size of mini-batches during training - eval_batch_size (int): Size of mini-batches during evaluation - mini_batch_chunk_size (int): If mini-batches are larger than this number, they get broken down into chunks of + learning_rate: The learning rate of the optimizer + decoder_learning_rate: Optional, if set, the decoder is trained with a separate learning rate + mini_batch_size: Size of mini-batches during training + eval_batch_size: Size of mini-batches during evaluation + mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes - max_epochs (int): Maximum number of epochs to train. Terminates training if this number is surpassed. + max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. optimizer: The optimizer to use (typically SGD or Adam) - train_with_dev (bool): If True, the data from dev split is added to the training data - train_with_test (bool): If True, the data from test split is added to the training data + train_with_dev: If True, the data from dev split is added to the training data + train_with_test: If True, the data from test split is added to the training data main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) - monitor_test (bool): If True, test data is evaluated at end of each epoch + monitor_test: If True, test data is evaluated at end of each epoch monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample a percentage of data points from train. - max_grad_norm (Optional[float]): If not None, gradients are clipped to this value before an optimizer.step is - called. 
- use_final_model_for_eval (bool): If True, the final model is used for the final evaluation. If False, the + max_grad_norm: If not None, gradients are clipped to this value before an optimizer.step is called. + use_final_model_for_eval: If True, the final model is used for the final evaluation. If False, the model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary exclude_labels: Optionally define a list of labels to exclude from the evaluation @@ -359,20 +360,19 @@ 'cpu' (embeddings stored on CPU) or 'gpu' (embeddings stored on GPU) epoch: The starting epoch (normally 0 but could be higher if you continue training model) save_final_model: If True, the final model is saved at the end of training. - save_optimizer_state (bool): If True, the optimizer state is saved alongside the model + save_optimizer_state: If True, the optimizer state is saved alongside the model save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will be saved each 5 epochs. Default is 0 which means no model saving. - create_file_logs (bool): If True, logging output is written to a file - create_loss_file (bool): If True, a loss file logging output is created - use_amp (bool): If True, uses the torch automatic mixed precision - write_weights (bool): If True, write weights to weights.txt on each batch logging event. + create_file_logs: If True, logging output is written to a file + create_loss_file: If True, a loss file is created to log output during training + use_amp: If True, uses torch automatic mixed precision (AMP) + write_weights: If True, write weights to weights.txt on each batch logging event. plugins: Any additional plugins you want to pass to the trainer **kwargs: Additional arguments, for instance for the optimizer Returns: - ------- - dict: A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins - add additional information to this dictionary, such as the :class:`MetricHistoryPlugin` + A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins add + additional information to this dictionary, such as the :class:`flair.trainers.plugins.MetricHistoryPlugin` """ # Create output folder base_path = Path(base_path) @@ -463,7 +463,7 @@ train_custom( if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from - sampler.set_dataset(train_data) + sampler.set_dataset(train_data) # type: ignore[union-attr] shuffle = False # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) @@ -840,11 +840,7 @@ def _publish_eval_result(self, result, prefix=(), **kw): ) def _initialize_model_card(self, **training_parameters): - """Initializes model card with library versions and parameters. - - :param training_parameters: - :return: - """ + """Initializes model card with library versions and parameters.""" # create a model card for this model with Flair and PyTorch version model_card = { "flair_version": flair.__version__, diff --git a/flair/training_utils.py b/flair/training_utils.py index e465f86c1..387ad63ae 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -328,9 +328,11 @@ def load_state_dict(self, state_dict): def init_output_file(base_path: Union[str, Path], file_name: str) -> Path: """Creates a local file which can be appended to.
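Putting the `train_custom` arguments and the sampler handling above together, a typical call might look like the following sketch (paths and hyperparameters are placeholders; passing the sampler as a class works because the trainer instantiates it and calls `set_dataset`, as shown above):

```python
from flair.samplers import ExpandingChunkSampler
from flair.trainers import ModelTrainer

trainer = ModelTrainer(model, corpus)  # model and corpus defined elsewhere
trainer.train(
    "resources/taggers/example",
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10,
    sampler=ExpandingChunkSampler,  # passed as a class; set_dataset() is called internally
)
```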
- :param base_path: the path to the directory - :param file_name: the file name - :return: the created file + Args: + base_path: the path to the directory + file_name: the file name + + Returns: the created file """ base_path = Path(base_path) base_path.mkdir(parents=True, exist_ok=True) @@ -343,9 +345,11 @@ def convert_labels_to_one_hot(label_list: List[List[str]], label_dict: Dictionary) -> List[List[int]]: """Convert list of labels to a one hot list. - :param label_list: list of labels - :param label_dict: label dictionary - :return: converted label list + Args: + label_list: list of labels + label_dict: label dictionary + + Returns: converted label list """ return [[1 if label in labels else 0 for label in label_dict.get_items()] for labels in label_list] diff --git a/flair/visual/ner_html.py b/flair/visual/ner_html.py index fb7a09127..c71e10837 100644 --- a/flair/visual/ner_html.py +++ b/flair/visual/ner_html.py @@ -56,12 +56,15 @@ def render_ner_html( ) -> str: """Create the html code to visualize some sentences. - :param sentences: single sentence or list of sentences to convert to HTML - :param title: title of the HTML page - :param colors: dict where keys are tags and values are color HTML codes - :param default_color: color to use if colors parameter is missing a tag - :param wrap_page: if True method returns result of processing sentences wrapped by <html> and <body> tags, otherwise - without these tags # noqa: E501 - :return: HTML as a string + Args: + sentences: single sentence or list of sentences to convert to HTML + title: title of the HTML page + colors: dict where keys are tags and values are color HTML codes + default_color: color to use if colors parameter is missing a tag + wrap_page: if True, the method returns the processed sentences wrapped in <html> and <body> tags, otherwise without these tags + label_name: the label type that specifies which labels of the sentence are visualized. + + Returns: HTML as a string """ if isinstance(sentences, Sentence): sentences = [sentences] diff --git a/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md b/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md index 75c312ee6..7c7cf4644 100644 --- a/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md +++ b/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md @@ -1,7 +1,7 @@ # Transformer Embeddings Flair supports various Transformer-based architectures like BERT or XLNet from [HuggingFace](https://github.com/huggingface), -with two classes `TransformerWordEmbeddings` (to embed words) and `TransformerDocumentEmbeddings` (to embed documents). +with two classes [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) (to embed words or tokens) and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) (to embed documents). ## Embeddings Words with Transformers @@ -35,12 +35,12 @@ sentence = Sentence('The grass is green .') embedding.embed(sentence) ``` -[Here](https://huggingface.co/transformers/pretrained_models.html) is a full list of all models (BERT, RoBERTa, XLM, XLNet etc.). You can use any of these models with this class. +[Here](https://huggingface.co/models) you can search for models to use. Almost any transformer model from the hub will work with these classes.
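For example, any other checkpoint id from the hub can be passed in place of the default model string (a sketch using `roberta-base`):

```python
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

# any model identifier from the hub should work here
embedding = TransformerWordEmbeddings('roberta-base')

sentence = Sentence('The grass is green .')
embedding.embed(sentence)
```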
-## Embeddings Documents with Transformers +## Embedding Documents with Transformers -To embed a whole sentence as one (instead of each word in the sentence), simply use the TransformerDocumentEmbeddings +To embed a whole sentence as one (instead of each word in the sentence), simply use the [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) instead: ```python @@ -58,18 +58,18 @@ embedding.embed(sentence) ## Arguments -There are several options that you can set when you init the TransformerWordEmbeddings -and TransformerDocumentEmbeddings classes: +There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) +and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) classes: -| Argument | Default | Description -| -------------------- | ------------------- | ------------------------------------------------------------------------------ -| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) -| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding -| `subtoken_pooling` | `first` | See [Pooling operation section](#Pooling-operation). -| `layer_mean` | `True` | See [Layer mean section](#Layer-mean). -| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. -| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. -| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation +| Argument | Default | Description +|------------------------|----------------------| ------------------------------------------------------------------------------ +| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) +| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding +| `subtoken_pooling` | `first` | See [Pooling operation section](#Pooling-operation). +| `layer_mean` | `True` | See [Layer mean section](#Layer-mean). +| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. +| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. +| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation. ### Layers @@ -116,7 +116,7 @@ I.e. the size of the embedding increases the mode layers we use (but ONLY if lay ### Pooling operation -Most of the Transformer-based models (except Transformer-XL) use subword tokenization. E.g. the following +Most of the Transformer-based models use subword tokenization. E.g. the following token `puppeteer` could be tokenized into the subwords: `pupp`, `##ete` and `##er`. We implement different pooling operations for these subwords to generate the final token representation: @@ -138,7 +138,7 @@ print(sentence[0].embedding.size()) ### Layer mean The Transformer-based models have a certain number of layers. By default, all layers you select are -concatenated as explained above. Alternatively, you can set layer_mean=True to do a mean over all +concatenated as explained above. Alternatively, you can set `layer_mean=True` to do a mean over all selected layers. 
The resulting vector will then always have the same dimensionality as a single layer: ```python @@ -175,11 +175,4 @@ tensor([-0.0323, -0.3904, -1.1946, ..., 0.1305, -0.1365, -0.4323], ### Models -Please have a look at the awesome Hugging Face [documentation](https://huggingface.co/transformers/v2.3.0/pretrained_models.html) -for all supported pretrained models! - - -## Next - -You can now either go back to the [embedding overview](/resources/docs/TUTORIAL_EMBEDDINGS_OVERVIEW.md), -or check out [how to train models](/resources/docs/TUTORIAL_TRAINING_OVERVIEW.md). \ No newline at end of file +Please have a look at the awesome Hugging Face [hub](https://huggingface.co/models) for all supported pretrained models!
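For instance, `layers` and `layer_mean` can be combined to average the last four layers into a single-layer-sized vector (the printed size assumes a BERT-base model):

```python
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings

# average the last four layers into one vector per token
embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1,-2,-3,-4', layer_mean=True)

sentence = Sentence('The grass is green .')
embedding.embed(sentence)

print(sentence[0].embedding.size())  # torch.Size([768]) for BERT-base
```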