diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 89df8cb..17a1c7a 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,6 +8,8 @@ on: - 'undate/**' - 'tests/**' pull_request: + branches: + - "**" env: # python version used to calculate and submit code coverage diff --git a/README.md b/README.md index 37b8452..9c8e898 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ An `UndateInterval` is a date range between two `Undate` objects. Intervals can ``` You can initialize `Undate` or `UndateInterval` objects by parsing a date string with a specific converter, and you can also output an `Undate` object in those formats. -Available converters are "ISO8601" and "EDTF" (but only) +Currently available converters are "ISO8601" and "EDTF" and supported calendars. ```python >>> from undate import Undate @@ -156,6 +156,33 @@ Available converters are "ISO8601" and "EDTF" (but only) ``` +### Calendars + +All `Undate` objects are calendar aware, and date converters include support for parsing and working with dates from other calendars. The Gregorian calendar is used by default; currently `undate` supports the Hijri Islamic calendar and the Anno Mundi Hebrew calendar based on calendar conversion logic implemented in the [convertdate](https://convertdate.readthedocs.io/en/latest/) package. + +Dates are stored with the year, month, day and appropriate precision for the original calendar; internally, earliest and latest dates are calculated in Gregorian / Proleptic Gregorian calendar for standardized comparison across dates from different calendars. 
+ +```python +>>> from undate import Undate +>>> tammuz4816 = Undate.parse("26 Tammuz 4816", "Hebrew") +>>> tammuz4816 + +>>> rajab495 = Undate.parse("Rajab 495", "Hijri") +>>> rajab495 + +>>> y2k = Undate.parse("2001", "EDTF") +>>> y2k + +>>> [str(d.earliest) for d in [rajab495, tammuz4816, y2k]] +['1102-04-28', '1056-07-17', '2001-01-01'] +>>> [str(d.precision) for d in [rajab495, tammuz4816, y2k]] +['MONTH', 'DAY', 'YEAR'] +>>> sorted([rajab495, tammuz4816, y2k]) +[, , ] +``` + +* * * + For more examples, refer to the [example notebooks](https://github.com/dh-tech/undate-python/tree/main/examples/notebooks/) included in this repository. ## Documentation diff --git a/docs/undate/converters.rst b/docs/undate/converters.rst index 701aaf1..57e90a1 100644 --- a/docs/undate/converters.rst +++ b/docs/undate/converters.rst @@ -1,19 +1,25 @@ Converters ========== +Overview +-------- + .. automodule:: undate.converters.base :members: :undoc-members: +Formats +-------- + ISO8601 -------- +^^^^^^^ .. automodule:: undate.converters.iso8601 :members: :undoc-members: Extended Date-Time Format (EDTF) --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. automodule:: undate.converters.edtf.converter :members: @@ -23,8 +29,25 @@ Extended Date-Time Format (EDTF) :members: :undoc-members: -.. transformer is more of an internal, probably doesn't make sense to include -.. .. automodule:: undate.converters.edtf.transformer -.. :members: -.. :undoc-members: + +Calendars +--------- + +Gregorian +^^^^^^^^^ + +.. automodule:: undate.converters.calendars.gregorian + :members: + +Hijri (Islamic calendar) +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: undate.converters.calendars.hijri.converter + :members: + +Anno Mundi (Hebrew calendar) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
automodule:: undate.converters.calendars.hebrew.converter + :members: diff --git a/pyproject.toml b/pyproject.toml index 9179ca0..f1ad9a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" license = { text = "Apache-2" } requires-python = ">= 3.9" dynamic = ["version"] -dependencies = ["lark", "numpy"] +dependencies = ["lark[interegular]", "numpy", "convertdate", "strenum; python_version < '3.11'"] authors = [ { name = "Rebecca Sutton Koeser" }, { name = "Cole Crawford" }, diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 02cf820..5fefe49 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -1,10 +1,11 @@ """ -:class:`undate.converters.BaseDateConverter` provides a base class for +:class:`~undate.converters.BaseDateConverter` provides a base class for implementing date converters, which can provide support for -parsing and generating dates in different formats and also converting -dates between different calendars. +parsing and generating dates in different formats. +The converter subclass :class:`undate.converters.BaseCalendarConverter` +provides additional functionality needed for calendar conversion. -To add support for a new date format or calendar conversion: +To add support for a new date converter: - Create a new file under ``undate/converters/`` - For converters with sufficient complexity, you may want to create a submodule; @@ -18,6 +19,26 @@ The new subclass should be loaded automatically and included in the converters returned by :meth:`BaseDateConverter.available_converters` +To add support for a new calendar converter: + +- Create a new file under ``undate/converters/calendars/`` + - For converters with sufficient complexity, you may want to create a submodule; + see ``undate.converters.calendars.hijri`` for an example. 
+- Extend ``BaseCalendarConverter`` and implement ``parse`` and ``to_string`` + formatter methods as desired/appropriate for your converter as well as the + additional methods for ``max_month``, ``max_day``, and conversion ``to_gregorian`` + calendar. +- Import your calendar in ``undate/converters/calendars/__init__.py`` and include in ``__all__`` +- Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` +- Add the new calendar to the ``Calendar`` enum of supported calendars in + ``undate/undate.py`` and confirm that the ``get_converter`` method loads your + calendar converter correctly (an existing unit test should cover this). +- Consider creating a notebook to demonstrate the use of the calendar + converter. + +Calendar converter subclasses are also automatically loaded and included +in the list of available converters. + ------------------- """ @@ -90,6 +111,54 @@ def available_converters(cls) -> Dict[str, Type["BaseDateConverter"]]: """ Dictionary of available converters keyed on name. """ + return {c.name: c for c in cls.subclasses()} # type: ignore + + @classmethod + def subclasses(cls) -> list[Type["BaseDateConverter"]]: + """ + List of available converter classes. Includes calendar converter + subclasses. + """ # ensure undate converters are imported cls.import_converters() - return {c.name: c for c in cls.__subclasses__()} # type: ignore + + # find all direct subclasses, excluding base calendar converter + subclasses = cls.__subclasses__() + subclasses.remove(BaseCalendarConverter) + # add all subclasses of calendar converter base class + subclasses.extend(BaseCalendarConverter.__subclasses__()) + return subclasses + + +class BaseCalendarConverter(BaseDateConverter): + """Base class for calendar converters, with additional methods required + for calendars.""" + + #: Converter name. Subclasses must define a unique name. 
+ name: str = "Base Calendar Converter" + + def min_month(self) -> int: + """Smallest numeric month for this calendar.""" + raise NotImplementedError + + def max_month(self, year: int) -> int: + """Maximum numeric month for this calendar""" + raise NotImplementedError + + def first_month(self) -> int: + """first month in this calendar; by default, returns :meth:`min_month`.""" + return self.min_month() + + def last_month(self, year: int) -> int: + """last month in this calendar; by default, returns :meth:`max_month`.""" + return self.max_month(year) + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + raise NotImplementedError + + def to_gregorian(self, year, month, day) -> tuple[int, int, int]: + """Convert a date for this calendar specified by numeric year, month, and day, + into the Gregorian equivalent date. Should return a tuple of year, month, day. + """ + raise NotImplementedError diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py new file mode 100644 index 0000000..c14e115 --- /dev/null +++ b/src/undate/converters/calendars/__init__.py @@ -0,0 +1,5 @@ +from undate.converters.calendars.gregorian import GregorianDateConverter +from undate.converters.calendars.hijri import HijriDateConverter +from undate.converters.calendars.hebrew import HebrewDateConverter + +__all__ = ["HijriDateConverter", "GregorianDateConverter", "HebrewDateConverter"] diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py new file mode 100644 index 0000000..5a1d2dc --- /dev/null +++ b/src/undate/converters/calendars/gregorian.py @@ -0,0 +1,51 @@ +from calendar import monthrange + +from undate.converters.base import BaseCalendarConverter + + +class GregorianDateConverter(BaseCalendarConverter): + """ + Calendar converter class for Gregorian calendar. 
+ """ + + #: converter name: Gregorian + name: str = "Gregorian" + #: calendar + calendar_name: str = "Gregorian" + + #: known non-leap year + NON_LEAP_YEAR: int = 2022 + + def min_month(self) -> int: + """First month for the Gregorian calendar.""" + return 1 + + def max_month(self, year: int) -> int: + """maximum numeric month for the specified year in the Gregorian calendar""" + return 12 + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + # if month is known, use that to calculate + if month: + # if year is known, use it; otherwise use a known non-leap year + # (only matters for February) + year = year or self.NON_LEAP_YEAR + + # Use monthrange from python builtin calendar module. + # returns weekday of the first day of the month and number of days + # in the month for the specified year and month. + _, max_day = monthrange(year, month) + else: + # if month is unknown, return maximum possible + max_day = 31 + + return max_day + + def to_gregorian(self, year, month, day) -> tuple[int, int, int]: + """Convert to Gregorian date. This returns the specified year, month, + and day unchanged, but is provided for consistency since all calendar + converters need to support conversion to Gregorian calendar for + a common point of comparison. 
+ """ + return (year, month, day) diff --git a/src/undate/converters/calendars/hebrew/__init__.py b/src/undate/converters/calendars/hebrew/__init__.py new file mode 100644 index 0000000..e612ce3 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hebrew.converter import HebrewDateConverter + +__all__ = ["HebrewDateConverter"] diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py new file mode 100644 index 0000000..b8b4620 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -0,0 +1,78 @@ +from typing import Union + +from convertdate import hebrew # type: ignore +from lark.exceptions import UnexpectedCharacters + +from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer +from undate.undate import Undate, UndateInterval + + +class HebrewDateConverter(BaseCalendarConverter): + """ + Converter for Hebrew Anno Mundi calendar. + + Support for parsing Anno Mundi dates and converting to Undate and UndateInterval + objects in the Gregorian calendar. + """ + + #: converter name: Hebrew + name: str = "Hebrew" + calendar_name: str = "Anno Mundi" + + def __init__(self): + self.transformer = HebrewDateTransformer() + + def min_month(self) -> int: + """Smallest numeric month for this calendar.""" + return 1 + + def max_month(self, year: int) -> int: + """Maximum numeric month for this calendar. In Hebrew calendar, this is 12 or 13 + depending on whether it is a leap year.""" + return hebrew.year_months(year) + + def first_month(self) -> int: + """First month in this calendar. The Hebrew civil year starts in Tishri.""" + return hebrew.TISHRI + + def last_month(self, year: int) -> int: + """Last month in this calendar. 
Hebrew civil year starts in Tishri, + Elul is the month before Tishri.""" + return hebrew.ELUL + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + # NOTE: unreleased v2.4.1 of convertdate standardizes month_days to month_length + return hebrew.month_days(year, month) + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Hebrew date, specified by year, month, and day, + to the Gregorian equivalent date. Returns a tuple of year, month, day. + """ + return hebrew.to_gregorian(year, month, day) + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a Hebrew date string and return an :class:`~undate.undate.Undate` or + :class:`~undate.undate.UndateInterval`. + The Hebrew date string is preserved in the undate label. + """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Hebrew date parser + parsetree = hebrew_parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the original date as a label, with the calendar name + undate_obj.label = f"{value} {self.calendar_name}" + return undate_obj + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err + + # do we need to support conversion the other direction? + # i.e., generate a Hebrew date from an arbitrary undate or undate interval? 
diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark new file mode 100644 index 0000000..b55ec3f --- /dev/null +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -0,0 +1,56 @@ +%import common.WS +%ignore WS + +// only support day month year format for now +// parser requires numeric day and year to be distinguished based on order +hebrew_date: day month year | month year | year + +// TODO: handle date ranges? + +// TODO: add support for qualifiers? +// PGP dates use qualifiers like "first decade of" (for beginning of month) +// "first third of", seasons (can look for more examples) + +// Hebrew calendar starts with year 1 in 3761 BCE +year: /\d+/ + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + | month_13 +// months have 29 or 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /30/ + +// months, in order; from convertdate list +// with variants from Princeton Geniza Project +// support matching with and without accents +month_1: "Nisan" +// Iyar or Iyyar +month_2: /Iyy?ar/ +month_3: "Sivan" +month_4: "Tammuz" +month_5: "Av" +month_6: "Elul" +// Tishrei or Tishri +month_7: /Tishre?i/ +month_8: "Heshvan" +month_9: "Kislev" +// Tevet or Teveth +month_10: /[ṬT]eveth?/ +month_11: "Shevat" +// Adar I or Adar +month_12: /Adar( I)?/ +// Adar II or Adar Bet +month_13: /Adar (II|Bet)/ + + diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py new file mode 100644 index 0000000..5654f60 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -0,0 +1,9 @@ +import pathlib + +from lark import Lark + +grammar_path = pathlib.Path(__file__).parent / "hebrew.lark" + +with open(grammar_path) as grammar: + # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates + hebrew_parser 
= Lark(grammar.read(), start="hebrew_date", strict=True) diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py new file mode 100644 index 0000000..a6d2888 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -0,0 +1,40 @@ +from lark import Transformer, Tree + +from undate.undate import Undate, Calendar + + +class HebrewUndate(Undate): + """Undate convenience subclass; sets default calendar to Hebrew.""" + + calendar = Calendar.HEBREW + + +class HebrewDateTransformer(Transformer): + """Transform a Hebrew date parse tree and return an Undate or + UndateInterval.""" + + def hebrew_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with Hebrew year, month, day and + # Hebrew calendar + return HebrewUndate(**parts) + + # year translation is not needed since we want a tree with name year + # this is equivalent to a no-op + # def year(self, items): + # return Tree(data="year", children=[items[0]]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) diff --git a/src/undate/converters/calendars/hijri/__init__.py b/src/undate/converters/calendars/hijri/__init__.py new file mode 100644 index 0000000..4ac5b4b --- /dev/null +++ b/src/undate/converters/calendars/hijri/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hijri.converter import HijriDateConverter + +__all__ = ["HijriDateConverter"] diff --git a/src/undate/converters/calendars/hijri/converter.py 
b/src/undate/converters/calendars/hijri/converter.py new file mode 100644 index 0000000..b4b81b1 --- /dev/null +++ b/src/undate/converters/calendars/hijri/converter.py @@ -0,0 +1,67 @@ +from typing import Union + +from convertdate import islamic # type: ignore +from lark.exceptions import UnexpectedCharacters + +from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.hijri.parser import hijri_parser +from undate.converters.calendars.hijri.transformer import HijriDateTransformer +from undate.undate import Undate, UndateInterval + + +class HijriDateConverter(BaseCalendarConverter): + """ + Converter for Hijri / Islamic calendar. + + Support for parsing Hijri dates and converting to Undate and UndateInterval + objects in the Gregorian calendar. + """ + + #: converter name: Hijri + name: str = "Hijri" + calendar_name: str = "Hijrī" + + def __init__(self): + self.transformer = HijriDateTransformer() + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + return islamic.month_length(year, month) + + def min_month(self) -> int: + """smallest numeric month for this calendar.""" + return 1 + + def max_month(self, year: int) -> int: + """maximum numeric month for this calendar""" + return 12 + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Hijri date, specified by year, month, and day, + to the Gregorian equivalent date. Returns a tuple of year, month, day. + """ + return islamic.to_gregorian(year, month, day) + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a Hijri date string and return an :class:`~undate.undate.Undate` or + :class:`~undate.undate.UndateInterval`. + The Hijri date string is preserved in the undate label. 
+ """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Hijri date parser + parsetree = hijri_parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the original date as a label, with the calendar name + undate_obj.label = f"{value} {self.calendar_name}" + return undate_obj + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Hijri date") from err + + # do we need to support conversion the other direction? + # i.e., generate a Hijri date from an abitrary undate or undate interval? diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark new file mode 100644 index 0000000..4e6ccc7 --- /dev/null +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -0,0 +1,56 @@ +%import common.WS +%ignore WS + +// only support day month year format for now +// parser requires numeric day and year to be distinguished based on order +hijri_date: day month year | month year | year + +// TODO: handle date ranges? + +// TODO: add support for qualifiers? 
+// PGP dates use qualifiers like "first decade of" (for beginning of month) +// "first third of", seasons (can look for more examples) + +year: /\d+/ + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 +// months have 29 or 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /30/ + +// months, in order; from convertdate list +// with variants from Princeton Geniza Project +// support matching with and without accents +// al-Muḥarram or Muḥarram +month_1: /(al-)?Mu[ḥh]arram/ +month_2: /[ṢS]afar/ +// Rabīʿ al-ʾAwwal or Rabi' I +month_3: /Rab[īi][ʿ'] (al-[`ʾ]Awwal|I)/ +// Rabīʿ ath-Thānī or Rabi' II +month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/ +// Jumādā al-ʾAwwal or Jumādā I +month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/ +// Jumādā ath-Thāniya or Jumādā II +month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/ +month_7: "Rajab" +// Shaʿbān +month_8: /Sha[ʿ']b[āa]n/ +month_9: /Rama[ḍd][āa]n/ +month_10: /Shaww[āa]l/ +// Zū al-Qaʿdah or Dhu l-Qa'da +month_11: /(Z|Dh)[ūu] a?l-Qa[ʿ']dah?/ +// Zū al-Ḥijjah or Dhu l-Hijja +month_12: /(Z|Dh)[ūu] a?l-[HḤ]ijjah?/ + + diff --git a/src/undate/converters/calendars/hijri/parser.py b/src/undate/converters/calendars/hijri/parser.py new file mode 100644 index 0000000..273cdf9 --- /dev/null +++ b/src/undate/converters/calendars/hijri/parser.py @@ -0,0 +1,9 @@ +import pathlib + +from lark import Lark + +grammar_path = pathlib.Path(__file__).parent / "hijri.lark" + +with open(grammar_path) as grammar: + # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates + hijri_parser = Lark(grammar.read(), start="hijri_date", strict=True) diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py new file mode 100644 index 0000000..b575df9 --- /dev/null +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -0,0 +1,40 @@ +from lark 
import Transformer, Tree + +from undate.undate import Undate, Calendar + + +class HijriUndate(Undate): + """Undate convience subclass; sets default calendar to Hijri.""" + + calendar = Calendar.HIJRI + + +class HijriDateTransformer(Transformer): + """Transform a Hijri date parse tree and return an Undate or + UndateInterval.""" + + def hijri_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with islamic year, month, day and + # islamic calendar + return HijriUndate(**parts) + + # year translation is not needed since we want a tree with name year + # this is equivalent to a no-op + # def year(self, items): + # return Tree(data="year", children=[items[0]]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) diff --git a/src/undate/converters/edtf/edtf.lark b/src/undate/converters/edtf/edtf.lark index e6f3a15..8587599 100644 --- a/src/undate/converters/edtf/edtf.lark +++ b/src/undate/converters/edtf/edtf.lark @@ -16,7 +16,7 @@ date: year | year "-" month | year "-" month "-" day year: /-?\d+/ month: /(0[1-9])|(1[0-2])/ -day: /([0-2][1-9])|(3[0-1])/ +day: /(0[1-9])|([12][0-9])|(3[01])/ timeinterval: date "/" date diff --git a/src/undate/converters/edtf/parser.py b/src/undate/converters/edtf/parser.py index 6ab5139..27c2bd6 100644 --- a/src/undate/converters/edtf/parser.py +++ b/src/undate/converters/edtf/parser.py @@ -1,45 +1,8 @@ -import os.path +import pathlib from lark import Lark -grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") +grammar_path = 
pathlib.Path(__file__).parent / "edtf.lark" with open(grammar_path) as grammar: edtf_parser = Lark(grammar.read(), start="edtf") - - -# testcases = [ -# "1984", -# "1984-05", -# "1984-12", -# "1001-03-30", -# "1000/2000", -# "1000-01/2000-05-01", -# # level 1 -# "Y170000002", -# "2001-21", # spring 2001 -# # qualifiers -# "1984?", -# "2004-06~", -# "2004-06-11%", -# # unspecified digits from right -# "201X", -# "20XX", -# "2004-XX", -# "1985-04-XX", -# "1985-XX-XX", -# # open ended intervals -# "1985-04-12/..", -# "1985-04/..", -# "../1985-04-12", -# "/1985-04-12", -# "1984-13", -# ] - -# for testcase in testcases: -# print(f"\n{testcase}") -# tree = edtf_parser.parse(testcase) -# print(tree.pretty()) - - -# error_cases = ["1984-13", "Y1702"] diff --git a/src/undate/converters/edtf/transformer.py b/src/undate/converters/edtf/transformer.py index 135c93b..d5bcfcb 100644 --- a/src/undate/converters/edtf/transformer.py +++ b/src/undate/converters/edtf/transformer.py @@ -54,24 +54,19 @@ def year_unspecified(self, items): return Tree(data="year", children=[value]) def month_unspecified(self, items): + # combine multiple parts into a single string value = "".join(self.get_values(items)) return Tree(data="month", children=[value]) def day_unspecified(self, items): + # combine multiple parts into a single string value = "".join(self.get_values(items)) return Tree(data="day", children=[value]) def date_level1(self, items): return self.date(items) - def year(self, items): - # when the year is negative, there are two tokens - if len(items) > 1 and items[0] == "-": - # an anonymous token for the - and the integer year - year = items[1] - return Tree(data="year", children=[-year]) - - return Tree(data="year", children=[items[0]]) + # year (including negative years) use default transformation def year_fivedigitsplus(self, items): # strip off the leading Y and convert to integer diff --git a/src/undate/converters/iso8601.py b/src/undate/converters/iso8601.py index 
a0ecad5..09399eb 100644 --- a/src/undate/converters/iso8601.py +++ b/src/undate/converters/iso8601.py @@ -77,19 +77,33 @@ def _undate_to_string(self, undate: Undate) -> str: # TODO: may want to refactor and take advantage of the year/month/day properties # added for use in EDTF formatter code for date_portion, iso_format in self.iso_format.items(): + # is known means fully known, means guaranteed integer if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year # results in leading zero in some environments # and not others; force year to always be 4 digits - if date_portion == "year": - date_parts.append("%04d" % undate.earliest.year) - elif date_portion == "month" and undate.earliest.month: - date_parts.append("%02d" % undate.earliest.month) - elif date_portion == "day" and undate.earliest.day: - date_parts.append("%02d" % undate.earliest.day) # type: ignore + if date_portion == "year" and undate.year: + try: + date_parts.append("%04d" % int(undate.year)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.year) + elif date_portion == "month" and undate.month: + try: + date_parts.append("%02d" % int(undate.month)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.month) + elif date_portion == "day" and undate.day: + try: + date_parts.append("%02d" % int(undate.day)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.day) elif date_portion == "year": - # if not known but this is year, add '-' for --MM-DD unknown year format + # if year is not known, add '-' for year portion, + # to genereate --MM-DD unknown year format date_parts.append("-") # TODO: fix type error: "list[str | None]" is incompatible with "Iterable[str]" return "-".join(date_parts) # type: ignore diff --git a/src/undate/undate.py b/src/undate/undate.py index 7df7634..fab277c 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,6 +1,14 @@ import 
datetime import re -from calendar import monthrange + +from enum import auto + +try: + # StrEnum was only added in python 3.11 + from enum import StrEnum +except ImportError: + # for python 3.10 or earlier, use third-party package + from strenum import StrEnum # type: ignore # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Dict, Optional, Union @@ -9,6 +17,21 @@ from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision, Timedelta +class Calendar(StrEnum): + """Supported calendars""" + + GREGORIAN = auto() + HIJRI = auto() + HEBREW = auto() + + @staticmethod + def get_converter(calendar): + # calendar converter must be available with a name matching + # the title-case name of the calendar enum entry + converter_cls = BaseDateConverter.available_converters()[calendar.value.title()] + return converter_cls() + + class Undate: """object for representing uncertain, fuzzy or partially unknown dates""" @@ -25,9 +48,9 @@ class Undate: converter: BaseDateConverter #: precision of the date (day, month, year, etc.) 
precision: DatePrecision + #: the calendar this date is using; Gregorian by default + calendar: Calendar = Calendar.GREGORIAN - #: known non-leap year - NON_LEAP_YEAR: int = 2022 # numpy datetime is stored as 64-bit integer, so min/max # depends on the time unit; assume days for now # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units @@ -43,6 +66,7 @@ def __init__( day: Optional[Union[int, str]] = None, converter: Optional[BaseDateConverter] = None, label: Optional[str] = None, + calendar: Optional[Union[str, Calendar]] = None, ): # keep track of initial values and which values are known # TODO: add validation: if str, must be expected length @@ -58,10 +82,25 @@ def __init__( elif year: self.precision = DatePrecision.YEAR + self.label = label + if calendar is not None: + self.set_calendar(calendar) + self.calendar_converter = Calendar.get_converter(self.calendar) + + self.calculate_earliest_latest(year, month, day) + + if converter is None: + # import all subclass definitions; initialize the default + converter_cls = BaseDateConverter.available_converters()[ + self.DEFAULT_CONVERTER + ] + converter = converter_cls() + self.converter = converter + + def calculate_earliest_latest(self, year, month, day): # special case: treat year = XXXX as unknown/none if year == "XXXX": year = None - if year is not None: # could we / should we use str.isnumeric here? try: @@ -81,32 +120,34 @@ def __init__( max_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, - # treat as none - # TODO: we should preserve this information somehow; - # difference between just a year and and an unknown month within a year - # maybe in terms of date precision ? 
+ # treat as unknown/none (date precision already set in init) if month == "XX": month = None - min_month = 1 - max_month = 12 + # get first and last month from the calendar (not always 1 and 12) + # as well as min/max months + earliest_month = self.calendar_converter.first_month() + latest_month = self.calendar_converter.last_month(max_year) + + min_month = self.calendar_converter.min_month() + max_month = self.calendar_converter.max_month(max_year) if month is not None: try: # treat as an integer if we can month = int(month) # update initial value self.initial_values["month"] = month - min_month = max_month = month + earliest_month = latest_month = month except ValueError: # if not, calculate min/max for missing digits - min_month, max_month = self._missing_digit_minmax( + earliest_month, latest_month = self._missing_digit_minmax( str(month), min_month, max_month ) - # similar to month above — unknown day, but day-level granularity if day == "XX": day = None + # if day is numeric, use as is if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): day = int(day) # update initial value - fully known day @@ -114,39 +155,42 @@ def __init__( min_day = max_day = day else: # if we have no day or partial day, calculate min / max - min_day = 1 - # if we know year and month (or max month), calculate exactly - if year and month and isinstance(year, int): - _, max_day = monthrange(int(year), max_month) - elif year is None and month: - # If we don't have year and month, - # calculate based on a known non-leap year - # (better than just setting 31, but still not great) - _, max_day = monthrange(self.NON_LEAP_YEAR, max_month) - else: - max_day = 31 + min_day = 1 # is min day ever anything other than 1 ? 
+ rel_year = year if year and isinstance(year, int) else None + # use month if it is an integer; otherwise use previously determined + # max month (which may not be 12 depending if partially unknown) + rel_month = month if month and isinstance(month, int) else latest_month + + max_day = self.calendar_converter.max_day(rel_year, rel_month) # if day is partially specified, narrow min/max further if day is not None: min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) # TODO: special case, if we get a Feb 29 date with unknown year, - # must switch the min/max years to known leap years! + # should switch the min/max years to known leap years! # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = Date(min_year, min_month, min_day) - self.latest = Date(max_year, max_month, max_day) - - if converter is None: - # import all subclass definitions; initialize the default - converter_cls = BaseDateConverter.available_converters()[ - self.DEFAULT_CONVERTER - ] - converter = converter_cls() - self.converter = converter + # convert to Gregorian calendar so earliest/latest can always + # be used for comparison + self.earliest = Date( + *self.calendar_converter.to_gregorian(min_year, earliest_month, min_day) + ) + self.latest = Date( + *self.calendar_converter.to_gregorian(max_year, latest_month, max_day) + ) - self.label = label + def set_calendar(self, calendar: Union[str, Calendar]): + if calendar is not None: + # if not passed as a Calendar instance, do a lookup + if not isinstance(calendar, Calendar): + # look for calendar by upper-case name + try: + calendar = Calendar[calendar.upper()] + except KeyError: + raise ValueError(f"Calendar `{calendar}` is not supported") + self.calendar = calendar def __str__(self) -> str: # if any portion of the date is partially known, construct @@ -170,9 +214,8 @@ def __str__(self) -> str: return self.converter.to_string(self) def __repr__(self) -> str: - if self.label: - 
return "" % (self.label, self) - return "" % self + label_str = f" '{self.label}'" if self.label else "" + return f"" @classmethod def parse(cls, date_string, format) -> Union["Undate", "UndateInterval"]: @@ -223,11 +266,15 @@ def __eq__(self, other: object) -> bool: if other is NotImplemented: return NotImplemented + # if both dates are fully known, then earliest/latest check + # is sufficient (and will work across calendars!) + # check for apparent equality + # - earliest/latest match and both have the same precision looks_equal = ( self.earliest == other.earliest and self.latest == other.latest - and self.initial_values == other.initial_values + and self.precision == other.precision ) # if everything looks the same, check for any unknowns in initial values # the same unknown date should NOT be considered equal @@ -237,8 +284,15 @@ def __eq__(self, other: object) -> bool: # in one format (i.e. X for missing digits). # If we support other formats, will need to normalize to common # internal format for comparison - if looks_equal and any("X" in str(val) for val in self.initial_values.values()): - return False + if looks_equal: + # if any part of either date that is known is _partially_ known, + # then these dates are not equal + if any( + [self.is_partially_known(p) for p in self.initial_values.keys()] + ) or any( + [other.is_partially_known(p) for p in other.initial_values.keys()] + ): + return False return looks_equal @@ -405,6 +459,8 @@ def _missing_digit_minmax( # given a possible range, calculate min/max values for a string # with a missing digit + # TODO: test this method directly + # assuming two digit only (i.e., month or day) possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] # ensure input value has two digits @@ -442,11 +498,14 @@ class UndateInterval: :type label: `str` """ - # date range between two uncertain dates + # date range between two undates earliest: Union[Undate, None] latest: Union[Undate, None] label: Union[str, None] + # 
TODO: let's think about adding an optional precision / length /size field + # using DatePrecision + def __init__( self, earliest: Optional[Undate] = None, diff --git a/tests/test_converters/edtf/test_edtf_parser.py b/tests/test_converters/edtf/test_edtf_parser.py index e9a3fdb..73d4e02 100644 --- a/tests/test_converters/edtf/test_edtf_parser.py +++ b/tests/test_converters/edtf/test_edtf_parser.py @@ -8,6 +8,7 @@ "1984-05", "1984-12", "1001-03-30", + "1901-02-20", "1000/2000", "1000-01/2000-05-01", # level 1 @@ -45,7 +46,7 @@ def test_should_parse(date_string): assert edtf_parser.parse(date_string) -error_cases = ["1984-13", "Y1702"] +error_cases = ["1984-13", "Y1702", "1984-00", "1984-01-00"] @pytest.mark.parametrize("date_string", error_cases) diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py index 60d5d1e..c9578e4 100644 --- a/tests/test_converters/test_base.py +++ b/tests/test_converters/test_base.py @@ -1,7 +1,7 @@ import logging import pytest -from undate.converters.base import BaseDateConverter +from undate.converters.base import BaseDateConverter, BaseCalendarConverter class TestBaseDateConverter: @@ -18,7 +18,7 @@ def test_available_converters(self): def test_converters_are_unique(self): assert len(BaseDateConverter.available_converters()) == len( - BaseDateConverter.__subclasses__() + BaseDateConverter.subclasses() ), "Formatter names have to be unique." 
def test_parse_not_implemented(self): @@ -60,5 +60,17 @@ class ISO8601DateFormat2(BaseDateConverter): name = "ISO8601" # duplicates existing formatter assert len(BaseDateConverter.available_converters()) != len( - BaseDateConverter.__subclasses__() + BaseDateConverter.subclasses() ) + + +class TestBaseCalendarConverter: + def test_not_implemented(self): + with pytest.raises(NotImplementedError): + BaseCalendarConverter().min_month() + with pytest.raises(NotImplementedError): + BaseCalendarConverter().max_month(1900) + with pytest.raises(NotImplementedError): + BaseCalendarConverter().max_day(1900, 12) + with pytest.raises(NotImplementedError): + BaseCalendarConverter().to_gregorian(1900, 12, 31) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py new file mode 100644 index 0000000..c3c8b7c --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -0,0 +1,155 @@ +import pytest + +from undate.converters.calendars import HebrewDateConverter +from undate.converters.calendars.hebrew.transformer import HebrewUndate +from undate.undate import Calendar, Undate +from undate.date import DatePrecision, Date + + +class TestHebrewDateConverter: + def test_parse(self): + # day + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + date_str = "26 Tammuz 4816" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4816, 4, 26) + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.DAY + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # month + date_str = "Ṭevet 5362" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(5362, 10) # Teveth = month 10 + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.MONTH + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # year + 
date_str = "4932" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4932) + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.YEAR + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + def test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # full date + + # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4 + date = HebrewUndate(4816, 4, 26) + assert date.earliest == Date(1056, 7, 17) + assert date.latest == Date(1056, 7, 17) + # 13 Tishrei 5416 Anno Mundi (1655-10-14) + date = HebrewUndate(5416, 7, 13) # Tishrei = month 7 + assert date.earliest == Date(1655, 10, 14) + assert date.latest == Date(1655, 10, 14) + + # month + + # Ṭevet 5362 Anno Mundi (25 December, 1601 – 22 January, 1602) + date = HebrewUndate(5362, 10) + assert date.earliest == Date(1601, 12, 25) + assert date.latest == Date(1602, 1, 22) + + # year + # 5416 : October 1655 to September 1656 + date = HebrewUndate(5416) + assert date.earliest == Date(1655, 10, 2) + assert date.latest == Date(1656, 9, 18) + + def test_parse_error(self): + # a string we can't parse should raise an error + with pytest.raises(ValueError): + HebrewDateConverter().parse("January 2, 1991") + # empty string should also error + with pytest.raises(ValueError): + HebrewDateConverter().parse("") + + # non-string input should raise a type error + with pytest.raises(TypeError): + HebrewDateConverter().parse(42) + + with pytest.raises(TypeError): + HebrewDateConverter().parse({"foo": "bar"}) + + def test_partially_known(self): + # hebrew dates get existing partially unknown behavior + + converter = HebrewDateConverter() + + # hebrew first/last month are not the same as min/max + unknown_month = HebrewUndate(1243, "XX") + assert unknown_month.precision == DatePrecision.MONTH + assert unknown_month.earliest == Date( + *converter.to_gregorian(1243, converter.first_month(), 1) + ) + last_month = 
converter.last_month(year=1243) + assert unknown_month.latest == Date( + *converter.to_gregorian( + 1243, last_month, converter.max_day(1243, last_month) + ) + ) + + partially_unknown_month = HebrewUndate(1243, "1X") + assert partially_unknown_month.precision == DatePrecision.MONTH + assert partially_unknown_month.earliest == Date( + *converter.to_gregorian(1243, 10, 1) + ) + # for unknown digit, assume largest possible value instead + # of last semantic month in the year + last_month = converter.max_month(year=1243) + last_day = converter.max_day(1243, last_month) + assert partially_unknown_month.latest == Date( + *converter.to_gregorian(1243, last_month, last_day) + ) + + # second month has 29 days + unknown_day = HebrewUndate(1243, 2, "XX") + assert unknown_day.precision == DatePrecision.DAY + assert unknown_day.earliest == Date(*converter.to_gregorian(1243, 2, 1)) + assert unknown_day.latest == Date(*converter.to_gregorian(1243, 2, 29)) + + partially_unknown_day = HebrewUndate(1243, 2, "2X") + assert partially_unknown_day.precision == DatePrecision.DAY + assert partially_unknown_day.earliest == Date( + *converter.to_gregorian(1243, 2, 20) + ) + assert partially_unknown_day.latest == Date( + *converter.to_gregorian(1243, 2, 29) + ) + + def test_compare_across_calendars(self): + # only day-precision dates can be exactly equal across calendars + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + assert HebrewUndate(4816, 4, 26) == Undate(1056, 7, 17) + # 13 Tishrei 5416; Tishrei = month 7 (1655-10-14) + assert HebrewUndate(5416, 7, 13) == Undate(1655, 10, 14) + + # greater than / less than + assert HebrewUndate(4816) < Undate(1060) + assert HebrewUndate(5416) < Undate(1660) + assert HebrewUndate(5416, 7) > Undate(1655, 1) + assert HebrewUndate(4816, 4, 26) > Undate(1055, 5) + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056) + # so it falls within or is contained by July 1056 + assert HebrewUndate(4816, 4, 26) in Undate(1056, 7) + assert 
HebrewUndate(4816, 4, 26) not in Undate(1054) + + # sorting + sorted_dates = sorted( + [ + HebrewUndate(4816, 4, 26), # 1056-07-17 + HebrewUndate(5416), # 1655 + HebrewUndate(500), # -3261 + Undate(1995), + Undate(33), + Undate(1350), + ] + ) + expected_gregorian_years = [-3261, 33, 1056, 1350, 1655, 1995] + assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py new file mode 100644 index 0000000..69b929e --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py @@ -0,0 +1,65 @@ +import pytest +from lark.exceptions import UnexpectedCharacters, UnexpectedEOF + +from undate.converters.calendars.hebrew.parser import hebrew_parser + + +# for now, just test that valid dates can be parsed + +testcases = [ + # year + "5362", + # month + year + # - with and without accent + "Ṭevet 5362", + "Tevet 5362", + "Elul 4932", + "Sivan 5581", + # variant month name, with or without accent + "Ṭeveth 5362", + "Teveth 5362", + "Iyyar 1526", + "Iyar 1526", + # day month year + "26 Tammuz 4816", + "7 Heshvan 5425", + "26 Tishrei 5416", + "26 Tishri 5416", + "14 Adar 5403", + "14 Adar I 5403", + "9 Adar II 5404", + "9 Adar Bet 5404", + # two and 1 digit years + "536", + "53", + "3", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert hebrew_parser.parse(date_string) + + +error_cases = [ + # invalid days + ("0 Tammuz 5403", UnexpectedCharacters), + ("31 Tishri 5403", UnexpectedCharacters), + # month alone + ("Tishri", UnexpectedEOF), + # month day only + ("12 Heshvan", UnexpectedEOF), + # invalid month + ("Foo 383", UnexpectedCharacters), + # wrong format + ("2024-10-02", UnexpectedCharacters), + # year month day not supported + ("5403 Adar", UnexpectedCharacters), + ("5403 Adar 14", UnexpectedCharacters), +] + + 
+@pytest.mark.parametrize("date_string,exception", error_cases) +def test_should_error(date_string, exception): + with pytest.raises(exception): + hebrew_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py new file mode 100644 index 0000000..6e4a5e6 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -0,0 +1,43 @@ +import pytest +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import ( + HebrewDateTransformer, + HebrewUndate, +) +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +def test_hebrew_undate(): + assert HebrewUndate(848).calendar == Calendar.HEBREW + + +testcases = [ + # examples from Princeton Geniza Project + # date conversions checked with https://www.muqawwim.com/ + # 26 Tammuz 4816; Tammuz = month 4 + ("26 Tammuz 4816", HebrewUndate(4816, 4, 26), DatePrecision.DAY), + ("Tammuz 4816", HebrewUndate(4816, 4), DatePrecision.MONTH), + ("4816", HebrewUndate(4816), DatePrecision.YEAR), + # 26 Tishrei 5416: Tishrei = month 7 + ("26 Tishrei 5416", HebrewUndate(5416, 7, 26), DatePrecision.DAY), + # Ṭeveth = month 10 + ("Ṭevet 5362", HebrewUndate(5362, 10), DatePrecision.MONTH), + ("5362", HebrewUndate(5362), DatePrecision.YEAR), + # add when we support parsing ranges: + # Adar I and Adar II 5453 : (1693 CE) +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = HebrewDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = hebrew_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + assert transformed_date == expected + # currently only undates have date precision + if 
isinstance(transformed_date, Undate): + assert transformed_date.precision == expected_precision + # transformer doesn't have access to date string, + # label will need to be set by the converter class diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py new file mode 100644 index 0000000..6541586 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -0,0 +1,154 @@ +import pytest + +from undate.converters.calendars import HijriDateConverter +from undate.converters.calendars.hijri.transformer import HijriUndate +from undate.undate import Calendar, Undate +from undate.date import DatePrecision, Date + + +class TestHijriDateConverter: + def test_parse(self): + # day + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 + date_str = "7 Jumādā I 1243" + date = HijriDateConverter().parse(date_str) + assert date == HijriUndate(1243, 5, 7) + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.DAY + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + + # month + date_str = "Rajab 495" + date = HijriDateConverter().parse(date_str) + assert date == HijriUndate(495, 7) # Rajab is month 7 + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.MONTH + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + # Gregorian earliest/ latest + assert date.earliest == Date(1102, 4, 28) + assert date.latest == Date(1102, 5, 27) + + # year + date_str = "441" + date = HijriDateConverter().parse(date_str) + assert date == HijriUndate(441) + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.YEAR + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + # Gregorian earliest/ latest + assert date.earliest == Date(1049, 6, 11) + assert date.latest == Date(1050, 5, 31) + + def 
test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 + date = HijriUndate(1243, 5, 7) + assert date.earliest == Date(1827, 11, 26) + assert date.latest == Date(1827, 11, 26) + + # Jumādā I 1243 : 1827-11-20 to 1827-12-19 + date = HijriUndate(1243, 5) + assert date.earliest == Date(1827, 11, 20) + assert date.latest == Date(1827, 12, 19) + + # Rajab 495: 1102-04-28 to 1102-05-27 (Rajab = month 7) + date = HijriUndate(495, 7) + assert date.earliest == Date(1102, 4, 28) + assert date.latest == Date(1102, 5, 27) + + # 441 : 1049-06-11 to 1050-05-31 + date = HijriUndate(441) + assert date.earliest == Date(1049, 6, 11) + assert date.latest == Date(1050, 5, 31) + + # examples from ISMI data (reformatted to day month year) + # 14 Rabīʿ I 901 : 1495-12-11 (Rabi 1 = month 3 ) + date = HijriUndate(901, 3, 14) + assert date.earliest == Date(1495, 12, 11) + assert date.latest == Date(1495, 12, 11) + + # 884 : 1479-04-03 to 1480-03-21 + date = HijriUndate(884) + assert date.earliest == Date(1479, 4, 3) + assert date.latest == Date(1480, 3, 21) + + def test_parse_error(self): + # a string we can't parse should raise an error + with pytest.raises(ValueError): + HijriDateConverter().parse("January 2, 1991") + # empty string should also error + with pytest.raises(ValueError): + HijriDateConverter().parse("") + + def test_partially_known(self): + # hijri dates get existing partially unknown behavior + unknown_month = HijriUndate(1243, "XX") + assert unknown_month.precision == DatePrecision.MONTH + assert unknown_month.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 1, 1) + ) + assert unknown_month.latest == Date( + *HijriDateConverter().to_gregorian(1243, 12, 30) + ) + + partially_unknown_month = HijriUndate(1243, "1X") + assert partially_unknown_month.precision == DatePrecision.MONTH + assert partially_unknown_month.earliest == Date( + 
*HijriDateConverter().to_gregorian(1243, 10, 1) + ) + assert partially_unknown_month.latest == Date( + *HijriDateConverter().to_gregorian(1243, 12, 30) + ) + + unknown_day = HijriUndate(1243, 2, "XX") + assert unknown_day.precision == DatePrecision.DAY + assert unknown_day.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 1) + ) + # second month has 29 days + assert unknown_day.latest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 29) + ) + partially_unknown_day = HijriUndate(1243, 2, "2X") + assert partially_unknown_day.precision == DatePrecision.DAY + assert partially_unknown_day.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 20) + ) + assert partially_unknown_day.latest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 29) + ) + + def test_compare_across_calendars(self): + # only day-precision dates can be exactly equal across calendars + + # 7 Jumādā I 1243 Hijrī : 26 November, 1827; Jumada I = month 5 + assert HijriUndate(1243, 5, 7) == Undate(1827, 11, 26) + # 14 Rabīʿ I 901 : 1495-12-11 (Rabi 1 = month 3 ) + assert HijriUndate(901, 3, 14) == Undate(1495, 12, 11) + + # greater than / less than + assert HijriUndate(901) < Undate(1500) + assert HijriUndate(901) > Undate(1450) + # Jumādā I 1243 : 1827-11-20 to 1827-12-19 + assert HijriUndate(1243, 5) > Undate(1827, 10) + assert HijriUndate(1243, 5) < Undate(1828, 1) + + # 7 Jumādā I 1243 Hijrī : 26 November, 1827, so it falls + # within (or is contained by) November 1827 + assert HijriUndate(1243, 5, 7) in Undate(1827, 11) + assert HijriUndate(1243, 5, 7) not in Undate(1827, 10) + + # sorting + sorted_dates = sorted( + [ + HijriUndate(884), # 1479 to 1480 Gregorian + HijriUndate(441), # 1049 to 1050 Gregorian + HijriUndate(901), # 1495 to 1496 Gregorian + Undate(1995), + Undate(33), + Undate(1350), + ] + ) + expected_gregorian_years = [33, 1049, 1350, 1479, 1495, 1995] + assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years diff --git 
a/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py new file mode 100644 index 0000000..6b9c828 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py @@ -0,0 +1,76 @@ +import pytest +from undate.converters.calendars.hijri.parser import hijri_parser + + +# for now, just test that valid dates can be parsed + +testcases = [ + # year + "521", + # month + year + # - with and without accent + "al-Muḥarram 900", + "al-Muharram 900", + "Safar 581", + "Ṣafar 581", + # variant month name, with or without accent + "Muharram 900", + "Muḥarram 900", + "Rabīʿ al-ʾAwwal 901", + "Rabi' I 901", + "Rabīʿ ath-Thānī 343", + "Rabīʿ II 343", + "Jumādā al-ʾAwwal 1081", + "Jumada al-`Awwal 1081", + "Jumādā I 1081", + "Jumādā ath-Thāniyah 901", + "Jumada ath-Thaniyah 901", + "Jumādā II 981", + "Rajab 942", + "Shaʿbān 900", + "Sha'ban 900", + "Ramaḍān 903", + "Ramadan 903", + "Shawwāl 1042", + "Shawwal 1042", + "Zū al-Qaʿdah 124", + "Dhu l-Qa'da 124", + # day month year + "7 Jumādā I 1243", + "29 Muḥarram 1243", + "30 Muḥarram 1243", + "Rabīʿ I 901", + "12 Rabīʿ I 901", + # two and 1 digit years + "12 Rabīʿ I 90", + "12 Rabīʿ I 9", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert hijri_parser.parse(date_string) + + +error_cases = [ + # invalid days + "0 Muḥarram 1243", + "31 Muḥarram 1243", + # month alone + "Shawwal", + # month day only + "12 Shawwal", + # invalid month + "Foo 383", + # wrong format + "2024-10-02", + # year month day not supported + "901 Rabīʿ I", + "901 Rabīʿ I 12", +] + + +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + hijri_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py new 
file mode 100644 index 0000000..7ebc117 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py @@ -0,0 +1,49 @@ +import pytest +from undate.converters.calendars.hijri.parser import hijri_parser +from undate.converters.calendars.hijri.transformer import ( + HijriDateTransformer, + HijriUndate, +) +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +def test_hijri_undate(): + assert HijriUndate(848).calendar == Calendar.HIJRI + + +testcases = [ + # examples from Princeton Geniza Project + # date conversions checked with https://www.muqawwim.com/ + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 + ("7 Jumādā I 1243", HijriUndate(1243, 5, 7), DatePrecision.DAY), + ("Jumādā I 1243", HijriUndate(1243, 5), DatePrecision.MONTH), + ("1243", HijriUndate(1243), DatePrecision.YEAR), + # Gregorian: UndateInterval(Undate(1827, 7, 25), Undate(1828, 7, 13)), + # Zū al-Qaʿdah / Dhu l-Qa'da = month 11 + ("27 Dhū l-Qaʿda 632", HijriUndate(632, 11, 27), DatePrecision.DAY), + # Rajab = month 7 + ("Rajab 495", HijriUndate(495, 7), DatePrecision.MONTH), + ("441", HijriUndate(441), DatePrecision.YEAR), + # examples from ISMI data (reformatted to day month year) + # Rabi 1 = month 3 + ("14 Rabīʿ I 901", HijriUndate(901, 3, 14), DatePrecision.DAY), + ("884", HijriUndate(884), DatePrecision.YEAR), + # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), + # add when we support parsing ranges: + # 900 Muḥarram 1 - 999 Ḏu al-Ḥijjaẗ 29 : 1494-10-11 to 1591-10-18 +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = HijriDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = hijri_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + assert transformed_date == expected + # currently only 
undates have date precision + if isinstance(transformed_date, Undate): + assert transformed_date.precision == expected_precision + # transformer doesn't have access to date string, + # label will need to be set by the converter class diff --git a/tests/test_undate.py b/tests/test_undate.py index 65360d3..ecf0777 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -2,8 +2,10 @@ from datetime import date import pytest + +from undate.converters.base import BaseCalendarConverter from undate.date import DatePrecision, Timedelta -from undate.undate import Undate, UndateInterval +from undate.undate import Undate, UndateInterval, Calendar class TestUndate: @@ -25,11 +27,12 @@ def test_partially_known_str(self): # assert str(Undate(2022, day=7)) == "2022-XX-07" @ currently returns 2022-07 def test_repr(self): - assert repr(Undate(2022, 11, 7)) == "" + assert repr(Undate(2022, 11, 7)) == "" assert ( repr(Undate(2022, 11, 7, label="A Special Day")) - == "" + == "" ) + assert repr(Undate(484, calendar=Calendar.HIJRI)) == "" def test_init_str(self): assert Undate("2000").earliest.year == 2000 @@ -117,6 +120,17 @@ def test_init_partially_known_day(self): # (currently causes an exception because min/max years are not leap years) # Undate(None, 2, 29) + def test_calendar(self): + assert Undate(2024).calendar == Calendar.GREGORIAN + # by name, any case + assert Undate(848, calendar="HIJRI").calendar == Calendar.HIJRI + assert Undate(848, calendar="hijri").calendar == Calendar.HIJRI + # by enum + assert Undate(848, calendar=Calendar.HIJRI).calendar == Calendar.HIJRI + # invalid + with pytest.raises(ValueError, match="Calendar `foobar` is not supported"): + Undate(848, calendar="foobar") + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") @@ -552,3 +566,12 @@ def test_duration(self): # one year set and the other not currently raises not implemented error with pytest.raises(NotImplementedError): UndateInterval(Undate(2000), Undate()).duration() + 
+ +def test_calendar_get_converter(): + # ensure we can retrieve a calendar converter for each + # calendar named in our calendar enum + for cal in Calendar: + converter = Calendar.get_converter(cal) + assert isinstance(converter, BaseCalendarConverter) + assert converter.name.lower() == cal.name.lower()