diff --git a/.travis.yml b/.travis.yml index e963c4947..adb68ba8e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ matrix: - python: 3.7 env: TOXENV=py37 - python: 3.8 - env: TOXENV=py38 + env: TOXENV=py38 PYPI_RELEASE_JOB=true - python: 3.9-dev env: TOXENV=py39 @@ -41,4 +41,4 @@ deploy: on: tags: true repo: scrapinghub/dateparser - condition: "$TOXENV == py27" + condition: "$PYPI_RELEASE_JOB == true" diff --git a/README.rst b/README.rst index 3bf7caa6d..20fd15545 100644 --- a/README.rst +++ b/README.rst @@ -1,486 +1,209 @@ -==================================================== -dateparser -- python parser for human readable dates -==================================================== +.. Note that we use raw HTML in the header section because centering images and paragraphs is not supported in GitHub (https://github.com/github/markup/issues/163) + +.. raw:: html + +

+
+ + Dateparser + +
+

+ +

Python parser for human readable dates

+ +

+ + PyPI - Downloads + + + PyPI - Version + + + Code Coverage + + + Travis - Build + + + Readthedocs - Docs + +

+ +

+ Key Features • + How To Use • + Installation • + Common use cases • + You may also like... • + License +

+ + +Key Features +------------ + +- Support for almost every existing date format: absolute dates, + relative dates (``"two weeks ago"`` or ``"tomorrow"``), timestamps, + etc. +- Support for more than `200 language + locales `__. +- Language autodetection +- Customizable behavior through + `settings `__. +- Support for `non-Gregorian calendar + systems `__. +- Support for dates with timezones abbreviations or UTC offsets + (``"August 14, 2015 EST"``, ``"21 July 2013 10:15 pm +0500"``...) +- `Search + dates `__ + in longer texts. + +How To Use +---------- + +The most straightforward way to parse dates with **dateparser** is to +use the ``dateparser.parse()`` function, that wraps around most of the +functionality of the module. + +.. code:: python -.. image:: https://img.shields.io/travis/scrapinghub/dateparser/master.svg?style=flat-square - :target: https://travis-ci.org/scrapinghub/dateparser - :alt: travis build status - -.. image:: https://img.shields.io/pypi/v/dateparser.svg?style=flat-square - :target: https://pypi.python.org/pypi/dateparser - :alt: pypi version - -.. image:: https://readthedocs.org/projects/dateparser/badge/?version=latest - :target: http://dateparser.readthedocs.org/en/latest/?badge=latest - :alt: Documentation Status - -.. image:: https://codecov.io/gh/scrapinghub/dateparser/branch/master/graph/badge.svg - :target: https://codecov.io/gh/scrapinghub/dateparser - :alt: Code Coverage - -.. image:: https://badges.gitter.im/scrapinghub/dateparser.svg - :alt: Join the chat at https://gitter.im/scrapinghub/dateparser - :target: https://gitter.im/scrapinghub/dateparser?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge - - -`dateparser` provides modules to easily parse localized dates in almost -any string formats commonly found on web pages. - -.. contents:: - -Documentation -============= - -Documentation is built automatically and can be found on -`Read the Docs `_. 
- - -Features -======== + >>> import dateparser -* Generic parsing of dates in over 200 language locales plus numerous formats in a language agnostic fashion. -* Generic parsing of relative dates like: ``'1 min ago'``, ``'2 weeks ago'``, ``'3 months, 1 week and 1 day ago'``, ``'in 2 days'``, ``'tomorrow'``. -* Generic parsing of dates with time zones abbreviations or UTC offsets like: ``'August 14, 2015 EST'``, ``'July 4, 2013 PST'``, ``'21 July 2013 10:15 pm +0500'``. -* Date lookup in longer texts. -* Support for non-Gregorian calendar systems. See `Supported Calendars`_. -* Extensive test coverage. + >>> dateparser.parse('Fri, 12 Dec 2014 10:55:50') + datetime.datetime(2014, 12, 12, 10, 55, 50) + >>> dateparser.parse('1991-05-17') + datetime.datetime(1991, 5, 17, 0, 0) -Usage -===== + >>> dateparser.parse('In two months') # today is 1st Aug 2020 + datetime.datetime(2020, 10, 1, 11, 12, 27, 764201) -The most straightforward way is to use the `dateparser.parse <#dateparser.parse>`_ function, -that wraps around most of the functionality in the module. + >>> dateparser.parse('1484823450') # timestamp + datetime.datetime(2017, 1, 19, 10, 57, 30) -.. automodule:: dateparser - :members: parse + >>> dateparser.parse('January 12, 2012 10:00 PM EST') + datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) +As you can see, **dateparser** works with different date formats, but it +can also be used directly with strings in different languages: -Popular Formats ---------------- +.. code:: python - >>> import dateparser - >>> dateparser.parse('12/12/12') - datetime.datetime(2012, 12, 12, 0, 0) - >>> dateparser.parse('Fri, 12 Dec 2014 10:55:50') - datetime.datetime(2014, 12, 12, 10, 55, 50) >>> dateparser.parse('Martes 21 de Octubre de 2014') # Spanish (Tuesday 21 October 2014) datetime.datetime(2014, 10, 21, 0, 0) + >>> dateparser.parse('Le 11 Décembre 2014 à 09:00') # French (11 December 2014 at 09:00) datetime.datetime(2014, 12, 11, 9, 0) + >>> dateparser.parse('13 января 2015 г. 
в 13:34') # Russian (13 January 2015 at 13:34) datetime.datetime(2015, 1, 13, 13, 34) + >>> dateparser.parse('1 เดือนตุลาคม 2005, 1:00 AM') # Thai (1 October 2005, 1:00 AM) datetime.datetime(2005, 10, 1, 1, 0) -This will try to parse a date from the given string, attempting to -detect the language each time. - -You can specify the language(s), if known, using ``languages`` argument. In this case, given languages are used and language detection is skipped: - - >>> dateparser.parse('2015, Ago 15, 1:08 pm', languages=['pt', 'es']) - datetime.datetime(2015, 8, 15, 13, 8) - -If you know the possible formats of the dates, you can -use the ``date_formats`` argument: - - >>> dateparser.parse('22 Décembre 2010', date_formats=['%d %B %Y']) - datetime.datetime(2010, 12, 22, 0, 0) - - -Relative Dates --------------- - - >>> parse('1 hour ago') - datetime.datetime(2015, 5, 31, 23, 0) - >>> parse('Il ya 2 heures') # French (2 hours ago) - datetime.datetime(2015, 5, 31, 22, 0) - >>> parse('1 anno 2 mesi') # Italian (1 year 2 months) - datetime.datetime(2014, 4, 1, 0, 0) - >>> parse('yaklaşık 23 saat önce') # Turkish (23 hours ago) - datetime.datetime(2015, 5, 31, 1, 0) - >>> parse('Hace una semana') # Spanish (a week ago) - datetime.datetime(2015, 5, 25, 0, 0) - >>> parse('2小时前') # Chinese (2 hours ago) - datetime.datetime(2015, 5, 31, 22, 0) - -.. note:: Testing above code might return different values for you depending on your environment's current date and time. - -.. note:: Support for relative dates in future needs a lot of improvement, we look forward to community's contribution to get better on that part. See `Contributing`_. - - -OOTB Language Based Date Order Preference ------------------------------------------ - - >>> # parsing ambiguous date - >>> parse('02-03-2016') # assumes english language, uses MDY date order - datetime.datetime(2016, 2, 3, 0, 0) - >>> parse('le 02-03-2016') # detects french, uses DMY date order - datetime.datetime(2016, 3, 2, 0, 0) - -.. 
note:: Ordering is not locale based, that's why do not expect `DMY` order for UK/Australia English. You can specify date order in that case as follows usings `Settings`_: - - >>> parse('18-12-15 06:00', settings={'DATE_ORDER': 'DMY'}) - datetime.datetime(2015, 12, 18, 6, 0) - -For more on date order, please look at `Settings`_. - - -Timezone and UTC Offset ------------------------ - -By default, `dateparser` returns tzaware `datetime` if timezone is present in date string. Otherwise, it returns a naive `datetime` object. - - >>> parse('January 12, 2012 10:00 PM EST') - datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) + >>> dateparser.parse('yaklaşık 23 saat önce') # Turkish (23 hours ago), current time: 12:46 + datetime.datetime(2019, 9, 7, 13, 46) - >>> parse('January 12, 2012 10:00 PM -0500') - datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) + >>> dateparser.parse('2小时前') # Chinese (2 hours ago), current time: 22:30 + datetime.datetime(2018, 5, 31, 20, 30) - >>> parse('2 hours ago EST') - datetime.datetime(2017, 3, 10, 15, 55, 39, 579667, tzinfo=) +You can control multiple behaviors by using the ``settings`` parameter: - >>> parse('2 hours ago -0500') - datetime.datetime(2017, 3, 10, 15, 59, 30, 193431, tzinfo=) +.. code:: python - If date has no timezone name/abbreviation or offset, you can specify it using `TIMEZONE` setting. 
+ >>> dateparser.parse('2014-10-12', settings={'DATE_ORDER': 'YMD'}) + datetime.datetime(2014, 10, 12, 0, 0) - >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': 'US/Eastern'}) - datetime.datetime(2012, 1, 12, 22, 0) + >>> dateparser.parse('2014-10-12', settings={'DATE_ORDER': 'YDM'}) + datetime.datetime(2014, 12, 10, 0, 0) - >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': '+0500'}) - datetime.datetime(2012, 1, 12, 22, 0) + >>> dateparser.parse('1 year', settings={'PREFER_DATES_FROM': 'future'}) # Today is 2020-09-23 + datetime.datetime(2021, 9, 23, 0, 0) -`TIMEZONE` option may not be useful alone as it only attaches given timezone to -resultant `datetime` object. But can be useful in cases where you want conversions from and to different -timezones or when simply want a tzaware date with given timezone info attached. + >>> dateparser.parse('tomorrow', settings={'RELATIVE_BASE': datetime.datetime(1992, 1, 1)}) + datetime.datetime(1992, 1, 2, 0, 0) - >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': 'US/Eastern', 'RETURN_AS_TIMEZONE_AWARE': True}) - datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) +To see more examples on how to use the ``settings``, check the `settings +section `__ +in the docs. +False positives +^^^^^^^^^^^^^^^ - >>> parse('10:00 am', settings={'TIMEZONE': 'EST', 'TO_TIMEZONE': 'EDT'}) - datetime.datetime(2016, 9, 25, 11, 0) +.. warning:: + **dateparser** will do its best to return a date, dealing with multiple formats and different + locales. For that reason it is important that the input is a valid date, otherwise it could + return false positives. -Some more use cases for conversion of timezones. 
- >>> parse('10:00 am EST', settings={'TO_TIMEZONE': 'EDT'}) # date string has timezone info - datetime.datetime(2017, 3, 12, 11, 0, tzinfo=) +To reduce the possibility of receiving false positives, make sure that: - >>> parse('now EST', settings={'TO_TIMEZONE': 'UTC'}) # relative dates - datetime.datetime(2017, 3, 10, 23, 24, 47, 371823, tzinfo=) +- The input string is a valid date and doesn't contain any other words or numbers. +- If you know the language or languages beforehand, you add them through the ``languages`` or ``locales`` settings. -In case, no timezone is present in date string or defined in `settings`. You can still -return tzaware `datetime`. It is especially useful in case of relative dates when uncertain -what timezone is relative base. - >>> parse('2 minutes ago', settings={'RETURN_AS_TIMEZONE_AWARE': True}) - datetime.datetime(2017, 3, 11, 4, 25, 24, 152670, tzinfo=) +On the other hand, if you want to exclude any of the default parsers +(``timestamp``, ``relative-time``...) or change the order in which they +are executed, you can do so through the +`settings PARSERS `_. -In case, you want to compute relative dates in UTC instead of default system's local timezone, you can use `TIMEZONE` setting. +Installation +------------ - >>> parse('4 minutes ago', settings={'TIMEZONE': 'UTC'}) - datetime.datetime(2017, 3, 10, 23, 27, 59, 647248, tzinfo=) +Dateparser supports Python >= 3.5. You can install it by doing: -.. note:: In case, when timezone is present both in string and also specified using `settings`, string is parsed into tzaware representation and then converted to timezone specified in `settings`. 
+:: - >>> parse('10:40 pm PKT', settings={'TIMEZONE': 'UTC'}) - datetime.datetime(2017, 3, 12, 17, 40, tzinfo=) + $ pip install dateparser - >>> parse('20 mins ago EST', settings={'TIMEZONE': 'UTC'}) - datetime.datetime(2017, 3, 12, 21, 16, 0, 885091, tzinfo=) +If you want to use the jalali or hijri calendar, you need to install the +``calendars`` extra: -For more on timezones, please look at `Settings`_. +:: + $ pip install dateparser[calendars] -Incomplete Dates +Common use cases ---------------- - >>> from dateparser import parse - >>> parse('December 2015') # default behavior - datetime.datetime(2015, 12, 16, 0, 0) - >>> parse('December 2015', settings={'PREFER_DAY_OF_MONTH': 'last'}) - datetime.datetime(2015, 12, 31, 0, 0) - >>> parse('December 2015', settings={'PREFER_DAY_OF_MONTH': 'first'}) - datetime.datetime(2015, 12, 1, 0, 0) - - >>> parse('March') - datetime.datetime(2015, 3, 16, 0, 0) - >>> parse('March', settings={'PREFER_DATES_FROM': 'future'}) - datetime.datetime(2016, 3, 16, 0, 0) - >>> # parsing with preference set for 'past' - >>> parse('August', settings={'PREFER_DATES_FROM': 'past'}) - datetime.datetime(2015, 8, 15, 0, 0) - -You can also ignore parsing incomplete dates altogether by setting `STRICT_PARSING` flag as follows: - - >>> parse('December 2015', settings={'STRICT_PARSING': True}) - None - -For more on handling incomplete dates, please look at `Settings`_. - - -Search for Dates in Longer Chunks of Text ------------------------------------------ - -You can extract dates from longer strings of text. They are returned as list of tuples with text chunk containing the date and parsed datetime object. - -.. automodule:: dateparser.search - :members: search_dates - -Dependencies -============ - -`dateparser` relies on following libraries in some ways: - - * dateutil_'s module ``relativedelta`` for its freshness parser. - * convertdate_ to convert *Jalali* dates to *Gregorian*. - * hijri-converter_ to convert *Hijri* dates to *Gregorian*. 
- * tzlocal_ to reliably get local timezone. - * ruamel.yaml_ (optional) for operations on language files. - -.. _dateutil: https://pypi.python.org/pypi/python-dateutil -.. _convertdate: https://pypi.python.org/pypi/convertdate -.. _hijri-converter: https://pypi.python.org/pypi/hijri-converter -.. _tzlocal: https://pypi.python.org/pypi/tzlocal -.. _ruamel.yaml: https://pypi.python.org/pypi/ruamel.yaml - -Supported languages and locales -=============================== - -============ ================================================================ - Language Locales -============ ================================================================ -en 'en-001', 'en-150', 'en-AG', 'en-AI', 'en-AS', 'en-AT', 'en-AU', 'en-BB', 'en-BE', 'en-BI', 'en-BM', 'en-BS', 'en-BW', 'en-BZ', 'en-CA', 'en-CC', 'en-CH', 'en-CK', 'en-CM', 'en-CX', 'en-CY', 'en-DE', 'en-DG', 'en-DK', 'en-DM', 'en-ER', 'en-FI', 'en-FJ', 'en-FK', 'en-FM', 'en-GB', 'en-GD', 'en-GG', 'en-GH', 'en-GI', 'en-GM', 'en-GU', 'en-GY', 'en-HK', 'en-IE', 'en-IL', 'en-IM', 'en-IN', 'en-IO', 'en-JE', 'en-JM', 'en-KE', 'en-KI', 'en-KN', 'en-KY', 'en-LC', 'en-LR', 'en-LS', 'en-MG', 'en-MH', 'en-MO', 'en-MP', 'en-MS', 'en-MT', 'en-MU', 'en-MW', 'en-MY', 'en-NA', 'en-NF', 'en-NG', 'en-NL', 'en-NR', 'en-NU', 'en-NZ', 'en-PG', 'en-PH', 'en-PK', 'en-PN', 'en-PR', 'en-PW', 'en-RW', 'en-SB', 'en-SC', 'en-SD', 'en-SE', 'en-SG', 'en-SH', 'en-SI', 'en-SL', 'en-SS', 'en-SX', 'en-SZ', 'en-TC', 'en-TK', 'en-TO', 'en-TT', 'en-TV', 'en-TZ', 'en-UG', 'en-UM', 'en-VC', 'en-VG', 'en-VI', 'en-VU', 'en-WS', 'en-ZA', 'en-ZM', 'en-ZW' -zh -zh-Hans 'zh-Hans-HK', 'zh-Hans-MO', 'zh-Hans-SG' -hi -es 'es-419', 'es-AR', 'es-BO', 'es-BR', 'es-BZ', 'es-CL', 'es-CO', 'es-CR', 'es-CU', 'es-DO', 'es-EA', 'es-EC', 'es-GQ', 'es-GT', 'es-HN', 'es-IC', 'es-MX', 'es-NI', 'es-PA', 'es-PE', 'es-PH', 'es-PR', 'es-PY', 'es-SV', 'es-US', 'es-UY', 'es-VE' -ar 'ar-AE', 'ar-BH', 'ar-DJ', 'ar-DZ', 'ar-EG', 'ar-EH', 'ar-ER', 'ar-IL', 'ar-IQ', 'ar-JO', 'ar-KM', 
'ar-KW', 'ar-LB', 'ar-LY', 'ar-MA', 'ar-MR', 'ar-OM', 'ar-PS', 'ar-QA', 'ar-SA', 'ar-SD', 'ar-SO', 'ar-SS', 'ar-SY', 'ar-TD', 'ar-TN', 'ar-YE' -bn 'bn-IN' -fr 'fr-BE', 'fr-BF', 'fr-BI', 'fr-BJ', 'fr-BL', 'fr-CA', 'fr-CD', 'fr-CF', 'fr-CG', 'fr-CH', 'fr-CI', 'fr-CM', 'fr-DJ', 'fr-DZ', 'fr-GA', 'fr-GF', 'fr-GN', 'fr-GP', 'fr-GQ', 'fr-HT', 'fr-KM', 'fr-LU', 'fr-MA', 'fr-MC', 'fr-MF', 'fr-MG', 'fr-ML', 'fr-MQ', 'fr-MR', 'fr-MU', 'fr-NC', 'fr-NE', 'fr-PF', 'fr-PM', 'fr-RE', 'fr-RW', 'fr-SC', 'fr-SN', 'fr-SY', 'fr-TD', 'fr-TG', 'fr-TN', 'fr-VU', 'fr-WF', 'fr-YT' -ur 'ur-IN' -pt 'pt-AO', 'pt-CH', 'pt-CV', 'pt-GQ', 'pt-GW', 'pt-LU', 'pt-MO', 'pt-MZ', 'pt-PT', 'pt-ST', 'pt-TL' -ru 'ru-BY', 'ru-KG', 'ru-KZ', 'ru-MD', 'ru-UA' -id -sw 'sw-CD', 'sw-KE', 'sw-UG' -pa-Arab -de 'de-AT', 'de-BE', 'de-CH', 'de-IT', 'de-LI', 'de-LU' -ja -te -mr -vi -fa 'fa-AF' -ta 'ta-LK', 'ta-MY', 'ta-SG' -tr 'tr-CY' -yue -ko 'ko-KP' -it 'it-CH', 'it-SM', 'it-VA' -fil -gu -th -kn -ps -zh-Hant 'zh-Hant-HK', 'zh-Hant-MO' -ml -or -pl -my -pa -pa-Guru -am -om 'om-KE' -ha 'ha-GH', 'ha-NE' -nl 'nl-AW', 'nl-BE', 'nl-BQ', 'nl-CW', 'nl-SR', 'nl-SX' -uk -uz -uz-Latn -yo 'yo-BJ' -ms 'ms-BN', 'ms-SG' -ig -ro 'ro-MD' -mg -ne 'ne-IN' -as -so 'so-DJ', 'so-ET', 'so-KE' -si -km -zu -cs -sv 'sv-AX', 'sv-FI' -hu -el 'el-CY' -sn -kk -rw -ckb 'ckb-IR' -qu 'qu-BO', 'qu-EC' -ak -be -ti 'ti-ER' -az -az-Latn -af 'af-NA' -ca 'ca-AD', 'ca-FR', 'ca-IT' -sr-Latn 'sr-Latn-BA', 'sr-Latn-ME', 'sr-Latn-XK' -ii -he -bg -bm -ki -gsw 'gsw-FR', 'gsw-LI' -sr -sr-Cyrl 'sr-Cyrl-BA', 'sr-Cyrl-ME', 'sr-Cyrl-XK' -ug -zgh -ff 'ff-CM', 'ff-GN', 'ff-MR' -rn -da 'da-GL' -hr 'hr-BA' -sq 'sq-MK', 'sq-XK' -sk -fi -ks -hy -nb 'nb-SJ' -luy -lg -lo -bem -kok -luo -uz-Cyrl -ka -ee 'ee-TG' -mzn -bs-Cyrl -bs -bs-Latn -kln -kam -gl -tzm -dje -kab -bo 'bo-IN' -shi-Latn -shi -shi-Tfng -mn -ln 'ln-AO', 'ln-CF', 'ln-CG' -ky -sg -lt -nyn -guz -cgg -xog -lrc 'lrc-IQ' -mer -lu -sl -teo 'teo-KE' -brx -nd -mk -uz-Arab -mas 'mas-TZ' -nn -kde -mfe -lv -seh -mgh 
-az-Cyrl -ga -eu -yi -ce -et -ksb -bez -ewo -fy -ebu -nus -ast -asa -ses -os 'os-RU' -br -cy -kea -lag -sah -mt -vun -rof -jmc -lb -dav -dyo -dz -nnh -is -khq -bas -naq -mua -ksh -saq -se 'se-FI', 'se-SE' -dua -rwk -mgo -sbp -to -jgo -ksf -fo 'fo-DK' -gd -kl -rm -fur -agq -haw -chr -hsb -wae -nmg -lkt -twq -dsb -yav -kw -gv -smn -eo -tl -============ ================================================================ - - -Supported Calendars -=================== -* Gregorian calendar. - -* Persian Jalali calendar. For more information, refer to `Persian Jalali Calendar `_. - - >>> from dateparser.calendars.jalali import JalaliCalendar - >>> JalaliCalendar('جمعه سی ام اسفند ۱۳۸۷').get_date() - {'date_obj': datetime.datetime(2009, 3, 20, 0, 0), 'period': 'day'} - - -* Hijri/Islamic Calendar. For more information, refer to `Hijri Calendar `_. - - >>> from dateparser.calendars.hijri import HijriCalendar - >>> HijriCalendar('17-01-1437 هـ 08:30 مساءً').get_date() - {'date_obj': datetime.datetime(2015, 10, 30, 20, 30), 'period': 'day'} - -.. note:: `HijriCalendar` only works with Python ≥ 3.6. -.. note:: For `Finnish` language, please specify `settings={'SKIP_TOKENS': []}` to correctly parse freshness dates. - - -Install using following command to use calendars. - -.. tip:: - pip install dateparser[calendars] +**dateparser** can be used for a wide variety of purposes, +but it stands out when it comes to: + +Consuming data from different sources: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- **Scraping**: extract dates from different places with several + different formats and languages +- **IoT**: consuming data coming from different sources with different + date formats +- **Tooling**: consuming dates from different logs / sources +- **Format transformations**: when transforming dates coming from + different files (PDF, CSV, etc.) to other formats (database, etc). 
+ +Offering natural interaction with users: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- **Tooling and CLI**: allow users to write “3 days ago” to retrieve + information. +- **Search engine**: allow people to search by date in an easier / + more natural format. +- **Bots**: allow users to interact with a bot easily + +You may also like... +-------------------- + +- `price-parser `__ - A + small library for extracting price and currency from raw text + strings. +- `number-parser `__ - + Library to convert numbers written in natural language to its + equivalent numeric forms. +- `Scrapy `__ - Web crawling and web + scraping framework + +License +------- + +`BSD +3-Clause `__ diff --git a/docs/conf.py b/docs/conf.py index 86e5d2e2a..bdb6d40f6 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx', 'sphinx_rtd_theme'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -65,7 +65,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) # here, relative to this directory. 
They are copied after the builtin @@ -117,4 +117,16 @@ ] # sphinx.ext.intersphinx confs -intersphinx_mapping = {'python': ('https://docs.python.org/2', None)} +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} + + +html_theme_options = { + 'logo_only': True, + 'collapse_navigation': True, + 'sticky_navigation': True, + 'navigation_depth': 4, + 'includehidden': True, + 'titles_only': False +} + +html_logo = "../artwork/dateparser-logo.png" diff --git a/docs/contributing.rst b/docs/contributing.rst index e582053ea..a00ca1a07 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -1 +1,2 @@ +.. _contributing: .. include:: ../CONTRIBUTING.rst diff --git a/docs/index.rst b/docs/index.rst index 736090b70..e8ae4082e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,26 +1,58 @@ -.. dateparser documentation master file, created by - sphinx-quickstart on Tue Jul 9 22:26:36 2013. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +==================================================== +dateparser -- python parser for human readable dates +==================================================== + +.. image:: https://img.shields.io/pypi/dm/dateparser + :target: https://pypi.python.org/pypi/dateparser + :alt: pypi downloads + +.. image:: https://img.shields.io/pypi/v/dateparser.svg + :target: https://pypi.python.org/pypi/dateparser + :alt: pypi version + +.. image:: https://codecov.io/gh/scrapinghub/dateparser/branch/master/graph/badge.svg + :target: https://codecov.io/gh/scrapinghub/dateparser + :alt: Code Coverage + +.. image:: https://img.shields.io/travis/scrapinghub/dateparser/master.svg + :target: https://travis-ci.org/scrapinghub/dateparser + :alt: travis build status + +.. 
image:: https://readthedocs.org/projects/dateparser/badge/?version=latest + :target: http://dateparser.readthedocs.org/en/latest/?badge=latest + :alt: Documentation Status + + +`dateparser` provides modules to easily parse localized dates in almost +any string formats commonly found on web pages. -.. include:: ../README.rst -.. include:: usage.rst Documentation ============= +This documentation is built automatically and can be found on +`Read the Docs `_. + + +.. include:: introduction.rst + +Indices and tables +================== + + Contents: .. toctree:: :maxdepth: 2 + introduction installation + usage + supported_locales contributing authors history -Indices and tables -================== * :ref:`genindex` * :ref:`modindex` diff --git a/docs/introduction.rst b/docs/introduction.rst new file mode 100644 index 000000000..c4ed2e3c8 --- /dev/null +++ b/docs/introduction.rst @@ -0,0 +1,246 @@ +========================== +Introduction to dateparser +========================== + + +Features +======== + +* Generic parsing of dates in over 200 language locales plus numerous formats in a language agnostic fashion. +* Generic parsing of relative dates like: ``'1 min ago'``, ``'2 weeks ago'``, ``'3 months, 1 week and 1 day ago'``, ``'in 2 days'``, ``'tomorrow'``. +* Generic parsing of dates with time zones abbreviations or UTC offsets like: ``'August 14, 2015 EST'``, ``'July 4, 2013 PST'``, ``'21 July 2013 10:15 pm +0500'``. +* Date lookup in longer texts. +* Support for non-Gregorian calendar systems. See `Supported Calendars`_. +* Extensive test coverage. + + +Basic Usage +=========== + +The most straightforward way is to use the `dateparser.parse <#dateparser.parse>`_ function, +that wraps around most of the functionality in the module. + +.. 
automodule:: dateparser + :members: parse + + +Popular Formats +--------------- + + >>> import dateparser + >>> dateparser.parse('12/12/12') + datetime.datetime(2012, 12, 12, 0, 0) + >>> dateparser.parse('Fri, 12 Dec 2014 10:55:50') + datetime.datetime(2014, 12, 12, 10, 55, 50) + >>> dateparser.parse('Martes 21 de Octubre de 2014') # Spanish (Tuesday 21 October 2014) + datetime.datetime(2014, 10, 21, 0, 0) + >>> dateparser.parse('Le 11 Décembre 2014 à 09:00') # French (11 December 2014 at 09:00) + datetime.datetime(2014, 12, 11, 9, 0) + >>> dateparser.parse('13 января 2015 г. в 13:34') # Russian (13 January 2015 at 13:34) + datetime.datetime(2015, 1, 13, 13, 34) + >>> dateparser.parse('1 เดือนตุลาคม 2005, 1:00 AM') # Thai (1 October 2005, 1:00 AM) + datetime.datetime(2005, 10, 1, 1, 0) + +This will try to parse a date from the given string, attempting to +detect the language each time. + +You can specify the language(s), if known, using ``languages`` argument. In this case, given languages are used and language detection is skipped: + + >>> dateparser.parse('2015, Ago 15, 1:08 pm', languages=['pt', 'es']) + datetime.datetime(2015, 8, 15, 13, 8) + +If you know the possible formats of the dates, you can +use the ``date_formats`` argument: + + >>> dateparser.parse('22 Décembre 2010', date_formats=['%d %B %Y']) + datetime.datetime(2010, 12, 22, 0, 0) + + +Relative Dates +-------------- + + >>> parse('1 hour ago') + datetime.datetime(2015, 5, 31, 23, 0) + >>> parse('Il ya 2 heures') # French (2 hours ago) + datetime.datetime(2015, 5, 31, 22, 0) + >>> parse('1 anno 2 mesi') # Italian (1 year 2 months) + datetime.datetime(2014, 4, 1, 0, 0) + >>> parse('yaklaşık 23 saat önce') # Turkish (23 hours ago) + datetime.datetime(2015, 5, 31, 1, 0) + >>> parse('Hace una semana') # Spanish (a week ago) + datetime.datetime(2015, 5, 25, 0, 0) + >>> parse('2小时前') # Chinese (2 hours ago) + datetime.datetime(2015, 5, 31, 22, 0) + +.. 
note:: Testing above code might return different values for you depending on your environment's current date and time. + +.. note:: Support for relative dates in future needs a lot of improvement, we look forward to community's contribution to get better on that part. See ":ref:`contributing`". + + +OOTB Language Based Date Order Preference +----------------------------------------- + + >>> # parsing ambiguous date + >>> parse('02-03-2016') # assumes english language, uses MDY date order + datetime.datetime(2016, 2, 3, 0, 0) + >>> parse('le 02-03-2016') # detects french, uses DMY date order + datetime.datetime(2016, 3, 2, 0, 0) + +.. note:: Ordering is not locale based, that's why do not expect `DMY` order for UK/Australia English. You can specify date order in that case as follows using `settings`: + + >>> parse('18-12-15 06:00', settings={'DATE_ORDER': 'DMY'}) + datetime.datetime(2015, 12, 18, 6, 0) + +For more on date order, please look at Settings. + + + +Timezone and UTC Offset +----------------------- + +By default, `dateparser` returns tzaware `datetime` if timezone is present in date string. Otherwise, it returns a naive `datetime` object. + + >>> parse('January 12, 2012 10:00 PM EST') + datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) + + >>> parse('January 12, 2012 10:00 PM -0500') + datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) + + >>> parse('2 hours ago EST') + datetime.datetime(2017, 3, 10, 15, 55, 39, 579667, tzinfo=) + + >>> parse('2 hours ago -0500') + datetime.datetime(2017, 3, 10, 15, 59, 30, 193431, tzinfo=) + + If date has no timezone name/abbreviation or offset, you can specify it using `TIMEZONE` setting. 
+ + >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': 'US/Eastern'}) + datetime.datetime(2012, 1, 12, 22, 0) + + >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': '+0500'}) + datetime.datetime(2012, 1, 12, 22, 0) + +`TIMEZONE` option may not be useful alone as it only attaches given timezone to +resultant `datetime` object. But can be useful in cases where you want conversions from and to different +timezones or when simply want a tzaware date with given timezone info attached. + + >>> parse('January 12, 2012 10:00 PM', settings={'TIMEZONE': 'US/Eastern', 'RETURN_AS_TIMEZONE_AWARE': True}) + datetime.datetime(2012, 1, 12, 22, 0, tzinfo=) + + + >>> parse('10:00 am', settings={'TIMEZONE': 'EST', 'TO_TIMEZONE': 'EDT'}) + datetime.datetime(2016, 9, 25, 11, 0) + +Some more use cases for conversion of timezones. + + >>> parse('10:00 am EST', settings={'TO_TIMEZONE': 'EDT'}) # date string has timezone info + datetime.datetime(2017, 3, 12, 11, 0, tzinfo=) + + >>> parse('now EST', settings={'TO_TIMEZONE': 'UTC'}) # relative dates + datetime.datetime(2017, 3, 10, 23, 24, 47, 371823, tzinfo=) + +In case, no timezone is present in date string or defined in `settings`. You can still +return tzaware `datetime`. It is especially useful in case of relative dates when uncertain +what timezone is relative base. + + >>> parse('2 minutes ago', settings={'RETURN_AS_TIMEZONE_AWARE': True}) + datetime.datetime(2017, 3, 11, 4, 25, 24, 152670, tzinfo=) + +In case, you want to compute relative dates in UTC instead of default system's local timezone, you can use `TIMEZONE` setting. + + >>> parse('4 minutes ago', settings={'TIMEZONE': 'UTC'}) + datetime.datetime(2017, 3, 10, 23, 27, 59, 647248, tzinfo=) + +.. note:: In case, when timezone is present both in string and also specified using `settings`, string is parsed into tzaware representation and then converted to timezone specified in `settings`. 
+ + >>> parse('10:40 pm PKT', settings={'TIMEZONE': 'UTC'}) + datetime.datetime(2017, 3, 12, 17, 40, tzinfo=) + + >>> parse('20 mins ago EST', settings={'TIMEZONE': 'UTC'}) + datetime.datetime(2017, 3, 12, 21, 16, 0, 885091, tzinfo=) + +For more on timezones, please look at Settings. + + +Incomplete Dates +---------------- + + >>> from dateparser import parse + >>> parse('December 2015') # default behavior + datetime.datetime(2015, 12, 16, 0, 0) + >>> parse('December 2015', settings={'PREFER_DAY_OF_MONTH': 'last'}) + datetime.datetime(2015, 12, 31, 0, 0) + >>> parse('December 2015', settings={'PREFER_DAY_OF_MONTH': 'first'}) + datetime.datetime(2015, 12, 1, 0, 0) + + >>> parse('March') + datetime.datetime(2015, 3, 16, 0, 0) + >>> parse('March', settings={'PREFER_DATES_FROM': 'future'}) + datetime.datetime(2016, 3, 16, 0, 0) + >>> # parsing with preference set for 'past' + >>> parse('August', settings={'PREFER_DATES_FROM': 'past'}) + datetime.datetime(2015, 8, 15, 0, 0) + +You can also ignore parsing incomplete dates altogether by setting `STRICT_PARSING` flag as follows: + + >>> parse('December 2015', settings={'STRICT_PARSING': True}) + None + +For more on handling incomplete dates, please look at Settings. + + +Search for Dates in Longer Chunks of Text +----------------------------------------- + +You can extract dates from longer strings of text. They are returned as list of tuples with text chunk containing the date and parsed datetime object. + +.. automodule:: dateparser.search + :members: search_dates + +Dependencies +============ + +`dateparser` relies on following libraries in some ways: + + * dateutil_'s module ``relativedelta`` for its freshness parser. + * convertdate_ to convert *Jalali* dates to *Gregorian*. + * hijri-converter_ to convert *Hijri* dates to *Gregorian*. + * tzlocal_ to reliably get local timezone. + * ruamel.yaml_ (optional) for operations on language files. + +.. _dateutil: https://pypi.python.org/pypi/python-dateutil +.. 
_convertdate: https://pypi.python.org/pypi/convertdate +.. _hijri-converter: https://pypi.python.org/pypi/hijri-converter +.. _tzlocal: https://pypi.python.org/pypi/tzlocal +.. _ruamel.yaml: https://pypi.python.org/pypi/ruamel.yaml + +Supported languages and locales +=============================== +You can check the supported locales by visiting the ":ref:`supported-locales`" section. + + +Supported Calendars +=================== +* Gregorian calendar. + +* Persian Jalali calendar. For more information, refer to `Persian Jalali Calendar `_. + + >>> from dateparser.calendars.jalali import JalaliCalendar + >>> JalaliCalendar('جمعه سی ام اسفند ۱۳۸۷').get_date() + {'date_obj': datetime.datetime(2009, 3, 20, 0, 0), 'period': 'day'} + + +* Hijri/Islamic Calendar. For more information, refer to `Hijri Calendar `_. + + >>> from dateparser.calendars.hijri import HijriCalendar + >>> HijriCalendar('17-01-1437 هـ 08:30 مساءً').get_date() + {'date_obj': datetime.datetime(2015, 10, 30, 20, 30), 'period': 'day'} + +.. note:: `HijriCalendar` only works with Python ≥ 3.6. +.. note:: For `Finnish` language, please specify `settings={'SKIP_TOKENS': []}` to correctly parse freshness dates. + + +Install using following command to use calendars. + +.. tip:: + pip install dateparser[calendars] diff --git a/docs/supported_locales.rst b/docs/supported_locales.rst new file mode 100644 index 000000000..dbd5cb615 --- /dev/null +++ b/docs/supported_locales.rst @@ -0,0 +1,214 @@ +.. 
_supported-locales: + +Supported languages and locales +=============================== + +============ ================================================================ + Language Locales +============ ================================================================ +en 'en-001', 'en-150', 'en-AG', 'en-AI', 'en-AS', 'en-AT', 'en-AU', 'en-BB', 'en-BE', 'en-BI', 'en-BM', 'en-BS', 'en-BW', 'en-BZ', 'en-CA', 'en-CC', 'en-CH', 'en-CK', 'en-CM', 'en-CX', 'en-CY', 'en-DE', 'en-DG', 'en-DK', 'en-DM', 'en-ER', 'en-FI', 'en-FJ', 'en-FK', 'en-FM', 'en-GB', 'en-GD', 'en-GG', 'en-GH', 'en-GI', 'en-GM', 'en-GU', 'en-GY', 'en-HK', 'en-IE', 'en-IL', 'en-IM', 'en-IN', 'en-IO', 'en-JE', 'en-JM', 'en-KE', 'en-KI', 'en-KN', 'en-KY', 'en-LC', 'en-LR', 'en-LS', 'en-MG', 'en-MH', 'en-MO', 'en-MP', 'en-MS', 'en-MT', 'en-MU', 'en-MW', 'en-MY', 'en-NA', 'en-NF', 'en-NG', 'en-NL', 'en-NR', 'en-NU', 'en-NZ', 'en-PG', 'en-PH', 'en-PK', 'en-PN', 'en-PR', 'en-PW', 'en-RW', 'en-SB', 'en-SC', 'en-SD', 'en-SE', 'en-SG', 'en-SH', 'en-SI', 'en-SL', 'en-SS', 'en-SX', 'en-SZ', 'en-TC', 'en-TK', 'en-TO', 'en-TT', 'en-TV', 'en-TZ', 'en-UG', 'en-UM', 'en-VC', 'en-VG', 'en-VI', 'en-VU', 'en-WS', 'en-ZA', 'en-ZM', 'en-ZW' +zh +zh-Hans 'zh-Hans-HK', 'zh-Hans-MO', 'zh-Hans-SG' +hi +es 'es-419', 'es-AR', 'es-BO', 'es-BR', 'es-BZ', 'es-CL', 'es-CO', 'es-CR', 'es-CU', 'es-DO', 'es-EA', 'es-EC', 'es-GQ', 'es-GT', 'es-HN', 'es-IC', 'es-MX', 'es-NI', 'es-PA', 'es-PE', 'es-PH', 'es-PR', 'es-PY', 'es-SV', 'es-US', 'es-UY', 'es-VE' +ar 'ar-AE', 'ar-BH', 'ar-DJ', 'ar-DZ', 'ar-EG', 'ar-EH', 'ar-ER', 'ar-IL', 'ar-IQ', 'ar-JO', 'ar-KM', 'ar-KW', 'ar-LB', 'ar-LY', 'ar-MA', 'ar-MR', 'ar-OM', 'ar-PS', 'ar-QA', 'ar-SA', 'ar-SD', 'ar-SO', 'ar-SS', 'ar-SY', 'ar-TD', 'ar-TN', 'ar-YE' +bn 'bn-IN' +fr 'fr-BE', 'fr-BF', 'fr-BI', 'fr-BJ', 'fr-BL', 'fr-CA', 'fr-CD', 'fr-CF', 'fr-CG', 'fr-CH', 'fr-CI', 'fr-CM', 'fr-DJ', 'fr-DZ', 'fr-GA', 'fr-GF', 'fr-GN', 'fr-GP', 'fr-GQ', 'fr-HT', 'fr-KM', 'fr-LU', 'fr-MA', 'fr-MC', 'fr-MF', 'fr-MG', 
'fr-ML', 'fr-MQ', 'fr-MR', 'fr-MU', 'fr-NC', 'fr-NE', 'fr-PF', 'fr-PM', 'fr-RE', 'fr-RW', 'fr-SC', 'fr-SN', 'fr-SY', 'fr-TD', 'fr-TG', 'fr-TN', 'fr-VU', 'fr-WF', 'fr-YT' +ur 'ur-IN' +pt 'pt-AO', 'pt-CH', 'pt-CV', 'pt-GQ', 'pt-GW', 'pt-LU', 'pt-MO', 'pt-MZ', 'pt-PT', 'pt-ST', 'pt-TL' +ru 'ru-BY', 'ru-KG', 'ru-KZ', 'ru-MD', 'ru-UA' +id +sw 'sw-CD', 'sw-KE', 'sw-UG' +pa-Arab +de 'de-AT', 'de-BE', 'de-CH', 'de-IT', 'de-LI', 'de-LU' +ja +te +mr +vi +fa 'fa-AF' +ta 'ta-LK', 'ta-MY', 'ta-SG' +tr 'tr-CY' +yue +ko 'ko-KP' +it 'it-CH', 'it-SM', 'it-VA' +fil +gu +th +kn +ps +zh-Hant 'zh-Hant-HK', 'zh-Hant-MO' +ml +or +pl +my +pa +pa-Guru +am +om 'om-KE' +ha 'ha-GH', 'ha-NE' +nl 'nl-AW', 'nl-BE', 'nl-BQ', 'nl-CW', 'nl-SR', 'nl-SX' +uk +uz +uz-Latn +yo 'yo-BJ' +ms 'ms-BN', 'ms-SG' +ig +ro 'ro-MD' +mg +ne 'ne-IN' +as +so 'so-DJ', 'so-ET', 'so-KE' +si +km +zu +cs +sv 'sv-AX', 'sv-FI' +hu +el 'el-CY' +sn +kk +rw +ckb 'ckb-IR' +qu 'qu-BO', 'qu-EC' +ak +be +ti 'ti-ER' +az +az-Latn +af 'af-NA' +ca 'ca-AD', 'ca-FR', 'ca-IT' +sr-Latn 'sr-Latn-BA', 'sr-Latn-ME', 'sr-Latn-XK' +ii +he +bg +bm +ki +gsw 'gsw-FR', 'gsw-LI' +sr +sr-Cyrl 'sr-Cyrl-BA', 'sr-Cyrl-ME', 'sr-Cyrl-XK' +ug +zgh +ff 'ff-CM', 'ff-GN', 'ff-MR' +rn +da 'da-GL' +hr 'hr-BA' +sq 'sq-MK', 'sq-XK' +sk +fi +ks +hy +nb 'nb-SJ' +luy +lg +lo +bem +kok +luo +uz-Cyrl +ka +ee 'ee-TG' +mzn +bs-Cyrl +bs +bs-Latn +kln +kam +gl +tzm +dje +kab +bo 'bo-IN' +shi-Latn +shi +shi-Tfng +mn +ln 'ln-AO', 'ln-CF', 'ln-CG' +ky +sg +lt +nyn +guz +cgg +xog +lrc 'lrc-IQ' +mer +lu +sl +teo 'teo-KE' +brx +nd +mk +uz-Arab +mas 'mas-TZ' +nn +kde +mfe +lv +seh +mgh +az-Cyrl +ga +eu +yi +ce +et +ksb +bez +ewo +fy +ebu +nus +ast +asa +ses +os 'os-RU' +br +cy +kea +lag +sah +mt +vun +rof +jmc +lb +dav +dyo +dz +nnh +is +khq +bas +naq +mua +ksh +saq +se 'se-FI', 'se-SE' +dua +rwk +mgo +sbp +to +jgo +ksf +fo 'fo-DK' +gd +kl +rm +fur +agq +haw +chr +hsb +wae +nmg +lkt +twq +dsb +yav +kw +gv +smn +eo +tl +============ 
================================================================ diff --git a/setup.py b/setup.py index 964a806a5..9cf2f3099 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,10 @@ import re from setuptools import setup, find_packages -open_as_utf = lambda x: open(x, encoding='utf-8') - -(__version__, ) = re.findall(r"__version__.*\s*=\s*[']([^']+)[']", - open('dateparser/__init__.py').read()) - -readme = re.sub(r':members:.+|..\sautomodule::.+|:class:|:func:', '', open_as_utf('README.rst').read()) -readme = re.sub(r'`Settings`_', '`Settings`', readme) -readme = re.sub(r'`Contributing`_', '`Contributing`', readme) -history = re.sub(r':mod:|:class:|:func:', '', open_as_utf('HISTORY.rst').read()) +__version__ = re.match(r"__version__.*\s*=\s*[']([^']+)[']", open('dateparser/__init__.py').read())[1] +introduction = re.sub(r':members:.+|..\sautomodule::.+|:class:|:func:|:ref:', '', open('docs/introduction.rst').read()) +history = re.sub(r':mod:|:class:|:func:', '', open('HISTORY.rst').read()) test_requirements = open('tests/requirements.txt').read().splitlines() @@ -18,7 +12,7 @@ name='dateparser', version=__version__, description='Date parsing library designed to parse dates from HTML pages', - long_description=readme + '\n\n' + history, + long_description=introduction + '\n\n' + history, author='Scrapinghub', author_email='info@scrapinghub.com', url='https://github.com/scrapinghub/dateparser', diff --git a/tox.ini b/tox.ini index 22c5282bd..2b4e86b07 100644 --- a/tox.ini +++ b/tox.ini @@ -23,5 +23,6 @@ changedir = docs deps = {[testenv]deps} sphinx + sphinx-rtd-theme commands = sphinx-build -b html . {envtmpdir}/html