From a2dfae65d3508505ee92ac245bfda8167e31b576 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 14:57:44 -0500 Subject: [PATCH 01/36] Preliminary hijri date parser --- .../converters/calendars/hijri/__init__.py | 0 .../converters/calendars/hijri/hijri.lark | 54 +++++++++++++++++++ .../converters/calendars/hijri/parser.py | 9 ++++ .../calendars/hijri/test_parser.py | 40 ++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 src/undate/converters/calendars/hijri/__init__.py create mode 100644 src/undate/converters/calendars/hijri/hijri.lark create mode 100644 src/undate/converters/calendars/hijri/parser.py create mode 100644 tests/test_converters/calendars/hijri/test_parser.py diff --git a/src/undate/converters/calendars/hijri/__init__.py b/src/undate/converters/calendars/hijri/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark new file mode 100644 index 0000000..56103a8 --- /dev/null +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -0,0 +1,54 @@ +%import common.WS +%ignore WS + +?date: year | month year | day month year | year month day +// NOTE: ISMI sample dates are year month day +// if we can assume years are 3 digits minimum, we can support year month day +// AND we can use + +// TODO: handle date ranges? + +// TODO: is there a minimum year we need to support? 
+// if we can assume 3+ digit years we can distinguish between days and years, +year: /\d{3,}/ + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 +// months have 29 or 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /30/ + +// months, in order; from convertdate list +// with variants from Princeton Geniza Project +// support matching with and without accents +month_1: /al-Mu[ḥh]arram/ | /Mu[ḥh]arram/ +month_2: /[ṢS]afar/ +// Rabīʿ al-ʾAwwal or Rabi' I +month_3: /Rab[īi][ʿ']' (al-[`ʾ]Awwal|I)/ +// Rabīʿ ath-Thānī" or Rabi' II +month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|Rabi' II)/ +// Jumādā al-ʾAwwal or Jumādā I +month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/ +// Jumādā ath-Thāniya or Jumādā II +month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/ +month_7: "Rajab" +// Shaʿbān +month_8: /Sha[ʿ']b[āa]n/ +month_9: /Rama[ḍd][āa]n/ +month_10: /Shaww[āa]l/ +// Zū al-Qaʿdah or Dhu l-Qa'da +month_11: /(Z|Dh)[ūu] a?l-Qa[ʿ']dah?/ +// Zū al-Ḥijjah or Dhu l-Hijja +month_12: /(Z|Dh)[ūu] a?l-[HḤ]ijjah?/ + + diff --git a/src/undate/converters/calendars/hijri/parser.py b/src/undate/converters/calendars/hijri/parser.py new file mode 100644 index 0000000..e7e7691 --- /dev/null +++ b/src/undate/converters/calendars/hijri/parser.py @@ -0,0 +1,9 @@ +import pathlib + +from lark import Lark + +grammar_path = pathlib.Path(__file__).parent / "hijri.lark" + +with open(grammar_path) as grammar: + # NOTE: LALR parser is faster but requires assumption of 3+ digit years + hijri_parser = Lark(grammar.read(), start="date", parser="lalr") diff --git a/tests/test_converters/calendars/hijri/test_parser.py b/tests/test_converters/calendars/hijri/test_parser.py new file mode 100644 index 0000000..78f37b7 --- /dev/null +++ b/tests/test_converters/calendars/hijri/test_parser.py @@ -0,0 +1,40 @@ +import pytest +from undate.converters.calendars.hijri.parser import hijri_parser + + +# for 
now, just test that valid dates can be parsed + +testcases = [ + # year + "521", + # month + year + # - with and without accent + "al-Muḥarram 900", + "al-Muharram 900", + "Safar 581", + "Ṣafar 581", + # variant month name, with or without accent + "Muharram 900", + "Muḥarram 900", + "Jumādā al-ʾAwwal 1081", + "Jumada al-`Awwal 1081", + "Jumādā I 1081", + "Jumādā ath-Thāniyah 901", + "Jumada ath-Thaniyah 901", + "Jumādā II 981", + "Shaʿbān 900", + "Sha'ban 900", + "Ramaḍān 903", + "Ramadan 903", + "Zū al-Qaʿdah 124", + "Dhu l-Qa'da 124", + # day month year + "7 Jumādā I 1243", + "29 Muḥarram 1243", + "30 Muḥarram 1243", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert hijri_parser.parse(date_string) From ed23f6c48a2a536511010c75f4b70ac230bd8832 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 15:10:38 -0500 Subject: [PATCH 02/36] Test all Hijri months; assume 3+ digit years and use LALR parser --- src/undate/converters/calendars/hijri/hijri.lark | 13 ++++++------- .../test_converters/calendars/hijri/test_parser.py | 11 +++++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark index 56103a8..c554a52 100644 --- a/src/undate/converters/calendars/hijri/hijri.lark +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -1,10 +1,9 @@ %import common.WS %ignore WS -?date: year | month year | day month year | year month day +?date: year | month year | day month year | year month | year month day // NOTE: ISMI sample dates are year month day -// if we can assume years are 3 digits minimum, we can support year month day -// AND we can use +// if we can assume years are 3 digits minimum, we can support year month day AND we can use faster LALR parser // TODO: handle date ranges? 
@@ -28,15 +27,15 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ -// months, in order; from convertdate list +// months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents month_1: /al-Mu[ḥh]arram/ | /Mu[ḥh]arram/ month_2: /[ṢS]afar/ // Rabīʿ al-ʾAwwal or Rabi' I -month_3: /Rab[īi][ʿ']' (al-[`ʾ]Awwal|I)/ -// Rabīʿ ath-Thānī" or Rabi' II -month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|Rabi' II)/ +month_3: /Rab[īi][ʿ'] (al-[`ʾ]Awwal|I)/ +// Rabīʿ ath-Thānī or Rabi' II +month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/ // Jumādā al-ʾAwwal or Jumādā I month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/ // Jumādā ath-Thāniya or Jumādā II diff --git a/tests/test_converters/calendars/hijri/test_parser.py b/tests/test_converters/calendars/hijri/test_parser.py index 78f37b7..dc31620 100644 --- a/tests/test_converters/calendars/hijri/test_parser.py +++ b/tests/test_converters/calendars/hijri/test_parser.py @@ -16,22 +16,33 @@ # variant month name, with or without accent "Muharram 900", "Muḥarram 900", + "Rabīʿ al-ʾAwwal 901", + "Rabi' I 901", + "Rabīʿ ath-Thānī 343", + "Rabīʿ II 343", "Jumādā al-ʾAwwal 1081", "Jumada al-`Awwal 1081", "Jumādā I 1081", "Jumādā ath-Thāniyah 901", "Jumada ath-Thaniyah 901", "Jumādā II 981", + "Rajab 942", "Shaʿbān 900", "Sha'ban 900", "Ramaḍān 903", "Ramadan 903", + "Shawwāl 1042", + "Shawwal 1042", "Zū al-Qaʿdah 124", "Dhu l-Qa'da 124", # day month year "7 Jumādā I 1243", "29 Muḥarram 1243", "30 Muḥarram 1243", + # year month, if we can assume 3+ digit years + "901 Rabīʿ I", + # year month day + "901 Rabīʿ I 12", ] From 646f739ad0f02de7d79dce01cc24a87b0ca4422e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 15:11:11 -0500 Subject: [PATCH 03/36] Clean up edtf parser code (remove old test case comments) --- src/undate/converters/edtf/edtf.lark | 2 +- src/undate/converters/edtf/parser.py | 41 
++-------------------------- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/src/undate/converters/edtf/edtf.lark b/src/undate/converters/edtf/edtf.lark index e6f3a15..677fa98 100644 --- a/src/undate/converters/edtf/edtf.lark +++ b/src/undate/converters/edtf/edtf.lark @@ -16,7 +16,7 @@ date: year | year "-" month | year "-" month "-" day year: /-?\d+/ month: /(0[1-9])|(1[0-2])/ -day: /([0-2][1-9])|(3[0-1])/ +day: /([0-2][0-9])|(3[0-1])/ timeinterval: date "/" date diff --git a/src/undate/converters/edtf/parser.py b/src/undate/converters/edtf/parser.py index 6ab5139..27c2bd6 100644 --- a/src/undate/converters/edtf/parser.py +++ b/src/undate/converters/edtf/parser.py @@ -1,45 +1,8 @@ -import os.path +import pathlib from lark import Lark -grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") +grammar_path = pathlib.Path(__file__).parent / "edtf.lark" with open(grammar_path) as grammar: edtf_parser = Lark(grammar.read(), start="edtf") - - -# testcases = [ -# "1984", -# "1984-05", -# "1984-12", -# "1001-03-30", -# "1000/2000", -# "1000-01/2000-05-01", -# # level 1 -# "Y170000002", -# "2001-21", # spring 2001 -# # qualifiers -# "1984?", -# "2004-06~", -# "2004-06-11%", -# # unspecified digits from right -# "201X", -# "20XX", -# "2004-XX", -# "1985-04-XX", -# "1985-XX-XX", -# # open ended intervals -# "1985-04-12/..", -# "1985-04/..", -# "../1985-04-12", -# "/1985-04-12", -# "1984-13", -# ] - -# for testcase in testcases: -# print(f"\n{testcase}") -# tree = edtf_parser.parse(testcase) -# print(tree.pretty()) - - -# error_cases = ["1984-13", "Y1702"] From 51850cc0d4a71903f366c5a4146f07a29ee7af95 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 16:36:23 -0500 Subject: [PATCH 04/36] Add transformer for hijri parser to convert parsed date to undate --- pyproject.toml | 2 +- .../converters/calendars/hijri/hijri.lark | 2 +- .../converters/calendars/hijri/parser.py | 2 +- .../converters/calendars/hijri/transformer.py | 56 
+++++++++++++++++++ src/undate/undate.py | 5 +- .../calendars/hijri/test_hijri_transformer.py | 56 +++++++++++++++++++ 6 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 src/undate/converters/calendars/hijri/transformer.py create mode 100644 tests/test_converters/calendars/hijri/test_hijri_transformer.py diff --git a/pyproject.toml b/pyproject.toml index 9179ca0..da206e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" license = { text = "Apache-2" } requires-python = ">= 3.9" dynamic = ["version"] -dependencies = ["lark", "numpy"] +dependencies = ["lark[interegular]", "numpy", "convertdate"] authors = [ { name = "Rebecca Sutton Koeser" }, { name = "Cole Crawford" }, diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark index c554a52..79b55c6 100644 --- a/src/undate/converters/calendars/hijri/hijri.lark +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -1,7 +1,7 @@ %import common.WS %ignore WS -?date: year | month year | day month year | year month | year month day +hijri_date: year | month year | day month year | year month | year month day // NOTE: ISMI sample dates are year month day // if we can assume years are 3 digits minimum, we can support year month day AND we can use faster LALR parser diff --git a/src/undate/converters/calendars/hijri/parser.py b/src/undate/converters/calendars/hijri/parser.py index e7e7691..df13a40 100644 --- a/src/undate/converters/calendars/hijri/parser.py +++ b/src/undate/converters/calendars/hijri/parser.py @@ -6,4 +6,4 @@ with open(grammar_path) as grammar: # NOTE: LALR parser is faster but requires assumption of 3+ digit years - hijri_parser = Lark(grammar.read(), start="date", parser="lalr") + hijri_parser = Lark(grammar.read(), start="hijri_date", strict=True, parser="lalr") diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py new file mode 
100644 index 0000000..10f5239 --- /dev/null +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -0,0 +1,56 @@ +from lark import Transformer, Tree +from convertdate import islamic + +from undate.undate import Undate, UndateInterval + + +class HijriDateTransformer(Transformer): + """Transform a Hijri date parse tree and return an Undate or + UndateInterval.""" + + def hijri_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # if we have a year, month, day, convert to a single undate + if len(parts.values()) == 3: + # convertdate returns a tuple of year, month day + converted_date = islamic.to_gregorian(**parts) + return Undate(*converted_date) + + # if not, convert to a date range + start, end = islamic_to_gregorian_interval(**parts) + # TODO: should we add optional date precision / interval length + # to UndateInteravl ? 
+ return UndateInterval(Undate(*start), Undate(*end)) + + # this does nothing + # def year(self, items): + # return Tree(data="year", children=[items[0]]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) + + +MIN_MONTH, MIN_DAY = 1, 1 +MAX_MONTH = 12 + + +def islamic_to_gregorian_interval(year, month=None, day=None): + start = (year, month or MIN_MONTH, day or MIN_DAY) + end_month = month or MAX_MONTH + # islamic calendar converter has month_length + if day is None: + day = islamic.month_length(year, end_month) + end = (year, month or MAX_MONTH, day) + return (islamic.to_gregorian(*start), islamic.to_gregorian(*end)) diff --git a/src/undate/undate.py b/src/undate/undate.py index 7df7634..f848474 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -442,11 +442,14 @@ class UndateInterval: :type label: `str` """ - # date range between two uncertain dates + # date range between two undates earliest: Union[Undate, None] latest: Union[Undate, None] label: Union[str, None] + # TODO: let's think about adding an optional precision / length /size field + # using DatePrecision + def __init__( self, earliest: Optional[Undate] = None, diff --git a/tests/test_converters/calendars/hijri/test_hijri_transformer.py b/tests/test_converters/calendars/hijri/test_hijri_transformer.py new file mode 100644 index 0000000..096bae7 --- /dev/null +++ b/tests/test_converters/calendars/hijri/test_hijri_transformer.py @@ -0,0 +1,56 @@ +import pytest +from undate.converters.calendars.hijri.parser import hijri_parser +from undate.converters.calendars.hijri.transformer import HijriDateTransformer +from undate.undate import Undate, UndateInterval +from undate.date import DatePrecision + +testcases = [ + # examples from Princeton Geniza 
Project + # date conversions checked with https://www.muqawwim.com/ + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE) + ("7 Jumādā I 1243", Undate(1827, 11, 26), DatePrecision.DAY), + ( + "Jumādā I 1243", + UndateInterval(Undate(1827, 11, 20), Undate(1827, 12, 19)), + DatePrecision.MONTH, + ), + ( + "1243", + UndateInterval(Undate(1827, 7, 25), Undate(1828, 7, 13)), + DatePrecision.YEAR, + ), + ("27 Dhū l-Qaʿda 632", Undate(1235, 8, 20), DatePrecision.DAY), + ( + "Rajab 495", + UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)), + DatePrecision.MONTH, + ), + ( + "441", + UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)), + DatePrecision.YEAR, + ), + # examples from ISMI data + ("901 Rabīʿ I 14", Undate(1495, 12, 11), DatePrecision.DAY), + ( + "884", + UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), + DatePrecision.YEAR, + ), + # add when we support parsing ranges: + # 900 Muḥarram 1 - 999 Ḏu al-Ḥijjaẗ 29 : 1494-10-11 to 1591-10-18 +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = HijriDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = hijri_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + assert transformed_date == expected + # currently only undates have date precision + if isinstance(transformed_date, Undate): + assert transformed_date.precision == expected_precision + # transformer doesn't have access to date string, + # label will need to be set by the converter class From 50f233185fbd5cc04c85cff4492bf767c7eeb0ed Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 16:37:21 -0500 Subject: [PATCH 05/36] Rename test directories & files to be consistent & explicit --- .../test_hijri/test_hijri_parser.py} | 0 .../hijri => test_calendars/test_hijri}/test_hijri_transformer.py | 0 2 files changed, 0 insertions(+), 0 
deletions(-) rename tests/test_converters/{calendars/hijri/test_parser.py => test_calendars/test_hijri/test_hijri_parser.py} (100%) rename tests/test_converters/{calendars/hijri => test_calendars/test_hijri}/test_hijri_transformer.py (100%) diff --git a/tests/test_converters/calendars/hijri/test_parser.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py similarity index 100% rename from tests/test_converters/calendars/hijri/test_parser.py rename to tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py diff --git a/tests/test_converters/calendars/hijri/test_hijri_transformer.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py similarity index 100% rename from tests/test_converters/calendars/hijri/test_hijri_transformer.py rename to tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py From 778c67b2b98f97ac293bdbb3ff34a02c322cb72f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 16:58:21 -0500 Subject: [PATCH 06/36] Add an undate converter to wire in hijri date parsing capability --- src/undate/converters/calendars/__init__.py | 3 ++ .../converters/calendars/hijri/__init__.py | 3 ++ .../converters/calendars/hijri/converter.py | 48 +++++++++++++++++++ .../test_hijri/test_hijri_converter.py | 32 +++++++++++++ 4 files changed, 86 insertions(+) create mode 100644 src/undate/converters/calendars/__init__.py create mode 100644 src/undate/converters/calendars/hijri/converter.py create mode 100644 tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py new file mode 100644 index 0000000..edc3efc --- /dev/null +++ b/src/undate/converters/calendars/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hijri import HijriDateConverter + +__all__ = ["HijriDateConverter"] diff --git a/src/undate/converters/calendars/hijri/__init__.py 
b/src/undate/converters/calendars/hijri/__init__.py index e69de29..4ac5b4b 100644 --- a/src/undate/converters/calendars/hijri/__init__.py +++ b/src/undate/converters/calendars/hijri/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hijri.converter import HijriDateConverter + +__all__ = ["HijriDateConverter"] diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py new file mode 100644 index 0000000..0502a12 --- /dev/null +++ b/src/undate/converters/calendars/hijri/converter.py @@ -0,0 +1,48 @@ +from typing import Union + +from lark.exceptions import UnexpectedCharacters + +from undate.converters.base import BaseDateConverter +from undate.converters.calendars.hijri.parser import hijri_parser +from undate.converters.calendars.hijri.transformer import HijriDateTransformer +from undate.undate import Undate, UndateInterval + + +class HijriDateConverter(BaseDateConverter): + """ + Converter for Hijri / Islamic calendar. + + Support for parsing Hijri dates and converting to Undate and UndateInterval + objects in the Gregorian calendar. + """ + + #: converter name: Hijri + name: str = "Hijri" + calendar_name: str = "Hijrī" + + def __init__(self): + self.transformer = HijriDateTransformer() + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a Hijri date string and return an :class:`~undate.undate.Undate` or + :class:`~undate.undate.UndateInterval` in Gregorian calendar. 
+ The Hijri date string is preserved in the undate label + """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Hijri date parser + parsetree = hijri_parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the original date as a label, with the calendar name + undate_obj.label = f"{value} {self.calendar_name}" + return undate_obj + except UnexpectedCharacters: + raise ValueError("Could not parse '%s' as a Hijri date" % value) + + # do we need to support conversion the other direction? + # i.e., generate a Hijri date from an abitrary undate or undate interval? diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py new file mode 100644 index 0000000..6493083 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -0,0 +1,32 @@ +import pytest +from undate.converters.calendars import HijriDateConverter +from undate.undate import Undate, UndateInterval + + +class TestHijriDateConverter: + def test_parse_(self): + # day + date_str = "7 Jumādā I 1243" + date = HijriDateConverter().parse(date_str) + assert date == Undate(1827, 11, 26) + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + + # month + date_str = "Rajab 495" + date = HijriDateConverter().parse(date_str) + assert date == UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)) + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + + # year + date_str = "441" + date = HijriDateConverter().parse(date_str) + assert date == UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)) + assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + + def test_parse_error(self): + # a string we can't parse should raise an 
error + with pytest.raises(ValueError): + HijriDateConverter().parse("January 2, 1991") + # empty string should also error + with pytest.raises(ValueError): + HijriDateConverter().parse("") From 99c06119695d63942c4a18a0aec1ec4f9205e682 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 17:07:35 -0500 Subject: [PATCH 07/36] Tell mypy to ignore that convertdate code is untyped --- src/undate/converters/calendars/hijri/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py index 10f5239..31d0992 100644 --- a/src/undate/converters/calendars/hijri/transformer.py +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -1,5 +1,5 @@ from lark import Transformer, Tree -from convertdate import islamic +from convertdate import islamic # type: ignore from undate.undate import Undate, UndateInterval From 315ad7a9db81f033daaa15bf2f9fa9316d2c32ba Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 17:32:00 -0500 Subject: [PATCH 08/36] Clean up one more date and add more possible todos --- src/undate/converters/calendars/hijri/hijri.lark | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark index 79b55c6..ae8d73e 100644 --- a/src/undate/converters/calendars/hijri/hijri.lark +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -7,6 +7,10 @@ hijri_date: year | month year | day month year | year month | year month day // TODO: handle date ranges? +// TODO: add support for qualifiers? +// PGP dates use qualifiers like "first decade of" (for beginning of month) +// "first third of", seasons (can look for more examples) + // TODO: is there a minimum year we need to support? 
// if we can assume 3+ digit years we can distinguish between days and years, year: /\d{3,}/ @@ -30,7 +34,8 @@ day: /[1-9]/ | /[12][0-9]/ | /30/ // months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents -month_1: /al-Mu[ḥh]arram/ | /Mu[ḥh]arram/ +// al-Muḥarram or Muḥarram +month_1: /(al-)?Mu[ḥh]arram/ month_2: /[ṢS]afar/ // Rabīʿ al-ʾAwwal or Rabi' I month_3: /Rab[īi][ʿ'] (al-[`ʾ]Awwal|I)/ From 18c8f259922bc578575172efab74ec38056b83bc Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 21 Nov 2024 17:35:29 -0500 Subject: [PATCH 09/36] Update src/undate/converters/calendars/hijri/transformer.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/undate/converters/calendars/hijri/transformer.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py index 31d0992..c3a059c 100644 --- a/src/undate/converters/calendars/hijri/transformer.py +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -47,10 +47,20 @@ def month(self, items): def islamic_to_gregorian_interval(year, month=None, day=None): + """Convert partial Hijri date to a Gregorian date interval. 
+ + Args: + year (int): Hijri year + month (int, optional): Hijri month (1-12) + day (int, optional): Hijri day (1-30) + + Returns: + tuple: (start_date, end_date) as tuples of (year, month, day) + """ start = (year, month or MIN_MONTH, day or MIN_DAY) end_month = month or MAX_MONTH # islamic calendar converter has month_length if day is None: day = islamic.month_length(year, end_month) - end = (year, month or MAX_MONTH, day) + end = (year, end_month, day) # Use end_month instead of redundant check return (islamic.to_gregorian(*start), islamic.to_gregorian(*end)) From f3ce58b6f714726c372c770006eaea3a6d641fb6 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 21 Nov 2024 17:35:53 -0500 Subject: [PATCH 10/36] Update src/undate/converters/edtf/edtf.lark Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/undate/converters/edtf/edtf.lark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/undate/converters/edtf/edtf.lark b/src/undate/converters/edtf/edtf.lark index 677fa98..8587599 100644 --- a/src/undate/converters/edtf/edtf.lark +++ b/src/undate/converters/edtf/edtf.lark @@ -16,7 +16,7 @@ date: year | year "-" month | year "-" month "-" day year: /-?\d+/ month: /(0[1-9])|(1[0-2])/ -day: /([0-2][0-9])|(3[0-1])/ +day: /(0[1-9])|([12][0-9])|(3[01])/ timeinterval: date "/" date From 11cc007bca8409118b8ef25f2a900c5e7c665cfa Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Thu, 21 Nov 2024 17:36:19 -0500 Subject: [PATCH 11/36] Update src/undate/converters/calendars/hijri/converter.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/undate/converters/calendars/hijri/converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 0502a12..e040d6e 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ 
b/src/undate/converters/calendars/hijri/converter.py @@ -41,8 +41,8 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters: - raise ValueError("Could not parse '%s' as a Hijri date" % value) + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Hijri date") from err # do we need to support conversion the other direction? # i.e., generate a Hijri date from an abitrary undate or undate interval? From 2cc596eb9a130270626b0ad5f257ebd2c0832dd8 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 21 Nov 2024 17:38:09 -0500 Subject: [PATCH 12/36] Add more error cases for EDTF and Hijri parser tests --- .../test_converters/edtf/test_edtf_parser.py | 3 ++- .../test_hijri/test_hijri_parser.py | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/test_converters/edtf/test_edtf_parser.py b/tests/test_converters/edtf/test_edtf_parser.py index e9a3fdb..73d4e02 100644 --- a/tests/test_converters/edtf/test_edtf_parser.py +++ b/tests/test_converters/edtf/test_edtf_parser.py @@ -8,6 +8,7 @@ "1984-05", "1984-12", "1001-03-30", + "1901-02-20", "1000/2000", "1000-01/2000-05-01", # level 1 @@ -45,7 +46,7 @@ def test_should_parse(date_string): assert edtf_parser.parse(date_string) -error_cases = ["1984-13", "Y1702"] +error_cases = ["1984-13", "Y1702", "1984-00", "1984-01-00"] @pytest.mark.parametrize("date_string", error_cases) diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py index dc31620..9d465ef 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py @@ -49,3 +49,24 @@ @pytest.mark.parametrize("date_string", testcases) def 
test_should_parse(date_string): assert hijri_parser.parse(date_string) + + +error_cases = [ + # invalid days + "0 Muḥarram 1243", + "31 Muḥarram 1243", + # month alone + "Shawwal", + # month day only + "12 Shawwal", + # invalid month + "Foo 383", + # wrong format + "2024-10-02", +] + + +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + hijri_parser.parse(date_string) From 0aac63a9de4fa62cafef821248cf302fcf860db7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 22 Nov 2024 15:45:31 -0500 Subject: [PATCH 13/36] Add calendar field to Undate object --- src/undate/undate.py | 21 +++++++++++++++++++++ tests/test_undate.py | 13 ++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index f848474..137c799 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,6 +1,7 @@ import datetime import re from calendar import monthrange +from enum import StrEnum, auto # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Dict, Optional, Union @@ -9,6 +10,13 @@ from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision, Timedelta +class Calendar(StrEnum): + """Supported calendars""" + + GREGORIAN = auto() + HIJRI = auto() + + class Undate: """object for representing uncertain, fuzzy or partially unknown dates""" @@ -25,6 +33,8 @@ class Undate: converter: BaseDateConverter #: precision of the date (day, month, year, etc.) 
precision: DatePrecision + #: the calendar this date is using; Gregorian by default + calendar: Calendar = Calendar.GREGORIAN #: known non-leap year NON_LEAP_YEAR: int = 2022 @@ -43,6 +53,7 @@ def __init__( day: Optional[Union[int, str]] = None, converter: Optional[BaseDateConverter] = None, label: Optional[str] = None, + calendar: Optional[Union[str, Calendar]] = None, ): # keep track of initial values and which values are known # TODO: add validation: if str, must be expected length @@ -58,6 +69,16 @@ def __init__( elif year: self.precision = DatePrecision.YEAR + if calendar is not None: + # if not passed as a Calendar instance, do a lookup + if not isinstance(calendar, Calendar): + # look for calendar by upper-case name + try: + calendar = Calendar[calendar.upper()] + except KeyError: + raise ValueError(f"Calendar `{calendar}` is not supported") + self.calendar = calendar + # special case: treat year = XXXX as unknown/none if year == "XXXX": year = None diff --git a/tests/test_undate.py b/tests/test_undate.py index 65360d3..fd4c169 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -3,7 +3,7 @@ import pytest from undate.date import DatePrecision, Timedelta -from undate.undate import Undate, UndateInterval +from undate.undate import Undate, UndateInterval, Calendar class TestUndate: @@ -117,6 +117,17 @@ def test_init_partially_known_day(self): # (currently causes an exception because min/max years are not leap years) # Undate(None, 2, 29) + def test_calendar(self): + assert Undate(2024).calendar == Calendar.GREGORIAN + # by name, any case + assert Undate(848, calendar="HIJRI").calendar == Calendar.HIJRI + assert Undate(848, calendar="hijri").calendar == Calendar.HIJRI + # by enum + assert Undate(848, calendar=Calendar.HIJRI).calendar == Calendar.HIJRI + # invalid + with pytest.raises(ValueError, match="Calendar `foobar` is not supported"): + Undate(848, calendar="foobar") + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") 
From e2444edd36cf322fb46a451a5b5e166434439ca5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 14:16:54 -0500 Subject: [PATCH 14/36] Partial refactor: initialize hijri dates as undate with hijri calendar --- .../converters/calendars/hijri/transformer.py | 33 ++++++---- src/undate/undate.py | 40 ++++++------ .../test_hijri/test_hijri_converter.py | 24 ++++++-- .../test_hijri/test_hijri_transformer.py | 61 +++++++++---------- 4 files changed, 93 insertions(+), 65 deletions(-) diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py index c3a059c..cbb9387 100644 --- a/src/undate/converters/calendars/hijri/transformer.py +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -1,7 +1,13 @@ from lark import Transformer, Tree from convertdate import islamic # type: ignore -from undate.undate import Undate, UndateInterval +from undate.undate import Undate, Calendar + + +class HijriUndate(Undate): + """Undate convenience subclass; sets default calendar to Hijri.""" + + calendar = Calendar.HIJRI class HijriDateTransformer(Transformer): @@ -17,17 +23,20 @@ def hijri_date(self, items): value = int(child.children[0]) parts[str(child.data)] = value - # if we have a year, month, day, convert to a single undate - if len(parts.values()) == 3: - # convertdate returns a tuple of year, month day - converted_date = islamic.to_gregorian(**parts) - return Undate(*converted_date) - - # if not, convert to a date range - start, end = islamic_to_gregorian_interval(**parts) - # TODO: should we add optional date precision / interval length - # to UndateInteravl ? 
- return UndateInterval(Undate(*start), Undate(*end)) + print(f"*** initializing undate with {parts} and Hijri calendar") + return HijriUndate(**parts) + + # # if we have a year, month, day, convert to a single undate + # if len(parts.values()) == 3: + # # convertdate returns a tuple of year, month day + # converted_date = islamic.to_gregorian(**parts) + # return Undate(*converted_date) + + # # if not, convert to a date range + # start, end = islamic_to_gregorian_interval(**parts) + # # TODO: should we add optional date precision / interval length + # # to UndateInteravl ? + # return UndateInterval(Undate(*start), Undate(*end)) # this does nothing # def year(self, items): diff --git a/src/undate/undate.py b/src/undate/undate.py index 137c799..be5a813 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -69,20 +69,25 @@ def __init__( elif year: self.precision = DatePrecision.YEAR + self.label = label if calendar is not None: - # if not passed as a Calendar instance, do a lookup - if not isinstance(calendar, Calendar): - # look for calendar by upper-case name - try: - calendar = Calendar[calendar.upper()] - except KeyError: - raise ValueError(f"Calendar `{calendar}` is not supported") - self.calendar = calendar + self.set_calendar(calendar) # special case: treat year = XXXX as unknown/none if year == "XXXX": year = None + self.calculate_earliest_latest(year, month, day) + + if converter is None: + # import all subclass definitions; initialize the default + converter_cls = BaseDateConverter.available_converters()[ + self.DEFAULT_CONVERTER + ] + converter = converter_cls() + self.converter = converter + + def calculate_earliest_latest(self, year, month, day): if year is not None: # could we / should we use str.isnumeric here? 
try: @@ -159,15 +164,16 @@ def __init__( self.earliest = Date(min_year, min_month, min_day) self.latest = Date(max_year, max_month, max_day) - if converter is None: - # import all subclass definitions; initialize the default - converter_cls = BaseDateConverter.available_converters()[ - self.DEFAULT_CONVERTER - ] - converter = converter_cls() - self.converter = converter - - self.label = label + def set_calendar(self, calendar: Union[str, Calendar]): + if calendar is not None: + # if not passed as a Calendar instance, do a lookup + if not isinstance(calendar, Calendar): + # look for calendar by upper-case name + try: + calendar = Calendar[calendar.upper()] + except KeyError: + raise ValueError(f"Calendar `{calendar}` is not supported") + self.calendar = calendar def __str__(self) -> str: # if any portion of the date is partially known, construct diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py index 6493083..f74d412 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -1,27 +1,43 @@ import pytest + from undate.converters.calendars import HijriDateConverter -from undate.undate import Undate, UndateInterval +from undate.undate import Undate, Calendar +from undate.date import DatePrecision class TestHijriDateConverter: def test_parse_(self): # day + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 date_str = "7 Jumādā I 1243" date = HijriDateConverter().parse(date_str) - assert date == Undate(1827, 11, 26) + assert date == Undate(1243, 5, 7) + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.DAY assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + # TODO: earliest/latest should be converted to Gregorian + # assert date.earliest == Date(1827, 11, 26) + # assert date.latest == 
Date(1827, 11, 26) # month date_str = "Rajab 495" date = HijriDateConverter().parse(date_str) - assert date == UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)) + assert date == Undate(495, 7) # Rajab is month 7 + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.MONTH assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + # TODO: Gregorian earliest/ latest + # assert date == UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)) # year date_str = "441" date = HijriDateConverter().parse(date_str) - assert date == UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)) + assert date == Undate(441) + assert date.calendar == Calendar.HIJRI + assert date.precision == DatePrecision.YEAR assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" + # TODO: Gregorian earliest/ latest + # assert date == UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)) def test_parse_error(self): # a string we can't parse should raise an error diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py index 096bae7..0cb5aa0 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py @@ -1,42 +1,39 @@ import pytest from undate.converters.calendars.hijri.parser import hijri_parser -from undate.converters.calendars.hijri.transformer import HijriDateTransformer -from undate.undate import Undate, UndateInterval +from undate.converters.calendars.hijri.transformer import ( + HijriDateTransformer, + HijriUndate, +) +from undate.undate import Undate, Calendar from undate.date import DatePrecision + +def test_hijri_undate(): + assert HijriUndate(848).calendar == Calendar.HIJRI + + testcases = [ # examples from Princeton Geniza Project # date conversions checked with https://www.muqawwim.com/ - # Monday, 7 Jumādā I 1243 
Hijrī (26 November, 1827 CE) - ("7 Jumādā I 1243", Undate(1827, 11, 26), DatePrecision.DAY), - ( - "Jumādā I 1243", - UndateInterval(Undate(1827, 11, 20), Undate(1827, 12, 19)), - DatePrecision.MONTH, - ), - ( - "1243", - UndateInterval(Undate(1827, 7, 25), Undate(1828, 7, 13)), - DatePrecision.YEAR, - ), - ("27 Dhū l-Qaʿda 632", Undate(1235, 8, 20), DatePrecision.DAY), - ( - "Rajab 495", - UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)), - DatePrecision.MONTH, - ), - ( - "441", - UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)), - DatePrecision.YEAR, - ), - # examples from ISMI data - ("901 Rabīʿ I 14", Undate(1495, 12, 11), DatePrecision.DAY), - ( - "884", - UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), - DatePrecision.YEAR, - ), + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 + ("7 Jumādā I 1243", HijriUndate(1243, 5, 7), DatePrecision.DAY), + ("Jumādā I 1243", HijriUndate(1243, 5), DatePrecision.MONTH), + # Gregorian: UndateInterval(Undate(1827, 11, 20), Undate(1827, 12, 19)) + ("1243", HijriUndate(1243), DatePrecision.YEAR), + # Gregorian: UndateInterval(Undate(1827, 7, 25), Undate(1828, 7, 13)), + # Zū al-Qaʿdah / Dhu l-Qa'da = month 11 + ("27 Dhū l-Qaʿda 632", HijriUndate(632, 11, 27), DatePrecision.DAY), + # Rajab = month 7 + ("Rajab 495", HijriUndate(495, 7), DatePrecision.MONTH), + # Gregorian: UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)), + ("441", HijriUndate(441), DatePrecision.YEAR), + # Gregorian: UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)), + # examples from ISMI data (reformatted to day month year) + # Rabi 1 = month 3 + ("14 Rabīʿ I 901", HijriUndate(901, 3, 14), DatePrecision.DAY), + # Gregorian: Undate(1495, 12, 11) + ("884", HijriUndate(884), DatePrecision.YEAR), + # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), # add when we support parsing ranges: # 900 Muḥarram 1 - 999 Ḏu al-Ḥijjaẗ 29 : 1494-10-11 to 1591-10-18 ] From 
3aa462b2da2366c5fb5640845ecf6c09fb3eb8de Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 16:24:10 -0500 Subject: [PATCH 15/36] Use calendar converter to get max month/day and convert to gregorian --- src/undate/converters/calendars/__init__.py | 3 +- src/undate/converters/calendars/gregorian.py | 41 ++++++++++++ .../converters/calendars/hijri/converter.py | 15 +++++ .../converters/calendars/hijri/transformer.py | 43 ++----------- src/undate/undate.py | 63 ++++++++++--------- .../test_hijri/test_hijri_converter.py | 60 ++++++++++++++---- .../test_hijri/test_hijri_transformer.py | 4 -- tests/test_undate.py | 10 +++ 8 files changed, 154 insertions(+), 85 deletions(-) create mode 100644 src/undate/converters/calendars/gregorian.py diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py index edc3efc..635af21 100644 --- a/src/undate/converters/calendars/__init__.py +++ b/src/undate/converters/calendars/__init__.py @@ -1,3 +1,4 @@ +from undate.converters.calendars.gregorian import GregorianDateConverter from undate.converters.calendars.hijri import HijriDateConverter -__all__ = ["HijriDateConverter"] +__all__ = ["HijriDateConverter", "GregorianDateConverter"] diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py new file mode 100644 index 0000000..2db1156 --- /dev/null +++ b/src/undate/converters/calendars/gregorian.py @@ -0,0 +1,41 @@ +from calendar import monthrange +from typing import Optional + +from undate.converters.base import BaseDateConverter + + +class GregorianDateConverter(BaseDateConverter): + """ + Converter class for Gregorian calendar. 
+ """ + + #: converter name: Gregorian + name: str = "Gregorian" + calendar_name: str = "Gregorian" + + #: known non-leap year + NON_LEAP_YEAR: int = 2022 + + def max_month(self, year: int) -> int: + """Maximum month for this calendar for this year""" + return 12 + + def max_day(self, year: Optional[int] = None, month: Optional[int] = None) -> int: + # if month is known, use that to calculate + if month: + # if year is known, use it; otherwise use a known non-leap year + # (only matters for February) + year = year or self.NON_LEAP_YEAR + + # Use monthrange from python builtin calendar module. + # returns first day of the month and number of days in the month + # for the specified year and month. + _, max_day = monthrange(year, month) + else: + # if year and month are unknown, return maximum possible + max_day = 31 + + return max_day + + def to_gregorian(self, year, month, day) -> tuple[int, int, int]: + return (year, month, day) diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index e040d6e..5c694f8 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -1,5 +1,6 @@ from typing import Union +from convertdate import islamic # type: ignore from lark.exceptions import UnexpectedCharacters from undate.converters.base import BaseDateConverter @@ -23,6 +24,20 @@ class HijriDateConverter(BaseDateConverter): def __init__(self): self.transformer = HijriDateTransformer() + def max_month(self, year: int) -> int: + """maximum numeric month for the specified year in this calendar""" + return 12 + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + return islamic.month_length(year, month) + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Hijri date, specified by year, month, and day, + to the Gregorian equivalent 
date. Returns a tuple of year, month, day. + """ + return islamic.to_gregorian(year, month, day) + def parse(self, value: str) -> Union[Undate, UndateInterval]: """ Parse a Hijri date string and return an :class:`~undate.undate.Undate` or diff --git a/src/undate/converters/calendars/hijri/transformer.py b/src/undate/converters/calendars/hijri/transformer.py index cbb9387..b575df9 100644 --- a/src/undate/converters/calendars/hijri/transformer.py +++ b/src/undate/converters/calendars/hijri/transformer.py @@ -1,5 +1,4 @@ from lark import Transformer, Tree -from convertdate import islamic # type: ignore from undate.undate import Undate, Calendar @@ -23,22 +22,12 @@ def hijri_date(self, items): value = int(child.children[0]) parts[str(child.data)] = value - print(f"*** initializing undate with {parts} and Hijri calendar") + # initialize and return an undate with islamic year, month, day and + # islamic calendar return HijriUndate(**parts) - # # if we have a year, month, day, convert to a single undate - # if len(parts.values()) == 3: - # # convertdate returns a tuple of year, month day - # converted_date = islamic.to_gregorian(**parts) - # return Undate(*converted_date) - - # # if not, convert to a date range - # start, end = islamic_to_gregorian_interval(**parts) - # # TODO: should we add optional date precision / interval length - # # to UndateInteravl ? - # return UndateInterval(Undate(*start), Undate(*end)) - - # this does nothing + # year translation is not needed since we want a tree with name year + # this is equivalent to a no-op # def year(self, items): # return Tree(data="year", children=[items[0]]) @@ -49,27 +38,3 @@ def month(self, items): tree = items[0] month_n = tree.data.split("_")[-1] return Tree(data="month", children=[month_n]) - - -MIN_MONTH, MIN_DAY = 1, 1 -MAX_MONTH = 12 - - -def islamic_to_gregorian_interval(year, month=None, day=None): - """Convert partial Hijri date to a Gregorian date interval. 
- - Args: - year (int): Hijri year - month (int, optional): Hijri month (1-12) - day (int, optional): Hijri day (1-30) - - Returns: - tuple: (start_date, end_date) as tuples of (year, month, day) - """ - start = (year, month or MIN_MONTH, day or MIN_DAY) - end_month = month or MAX_MONTH - # islamic calendar converter has month_length - if day is None: - day = islamic.month_length(year, end_month) - end = (year, end_month, day) # Use end_month instead of redundant check - return (islamic.to_gregorian(*start), islamic.to_gregorian(*end)) diff --git a/src/undate/undate.py b/src/undate/undate.py index be5a813..8500bf8 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,6 +1,5 @@ import datetime import re -from calendar import monthrange from enum import StrEnum, auto # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None @@ -16,6 +15,13 @@ class Calendar(StrEnum): GREGORIAN = auto() HIJRI = auto() + @staticmethod + def get_converter(calendar): + # calendar converter must be available with a name matching + # the title-case name of the calendar enum entry + converter_cls = BaseDateConverter.available_converters()[calendar.value.title()] + return converter_cls() + class Undate: """object for representing uncertain, fuzzy or partially unknown dates""" @@ -36,8 +42,6 @@ class Undate: #: the calendar this date is using; Gregorian by default calendar: Calendar = Calendar.GREGORIAN - #: known non-leap year - NON_LEAP_YEAR: int = 2022 # numpy datetime is stored as 64-bit integer, so min/max # depends on the time unit; assume days for now # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units @@ -72,10 +76,7 @@ def __init__( self.label = label if calendar is not None: self.set_calendar(calendar) - - # special case: treat year = XXXX as unknown/none - if year == "XXXX": - year = None + self.calendar_converter = Calendar.get_converter(self.calendar) self.calculate_earliest_latest(year, month, day) @@ 
-88,6 +89,9 @@ def __init__( self.converter = converter def calculate_earliest_latest(self, year, month, day): + # special case: treat year = XXXX as unknown/none + if year == "XXXX": + year = None if year is not None: # could we / should we use str.isnumeric here? try: @@ -107,15 +111,14 @@ def calculate_earliest_latest(self, year, month, day): max_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, - # treat as none - # TODO: we should preserve this information somehow; - # difference between just a year and and an unknown month within a year - # maybe in terms of date precision ? + # treat as unknown/none (date precision already set in init) if month == "XX": month = None - min_month = 1 - max_month = 12 + min_month = 1 # is min month ever anything other than 1 ? + # get max month from the calendar, since it depends on the + # calendar and potentially the year (e.g. leap years in Hebrew Anno Mundi) + max_month = self.calendar_converter.max_month(max_year) if month is not None: try: # treat as an integer if we can @@ -128,11 +131,11 @@ def calculate_earliest_latest(self, year, month, day): min_month, max_month = self._missing_digit_minmax( str(month), min_month, max_month ) - # similar to month above — unknown day, but day-level granularity if day == "XX": day = None + # if day is numeric, use as is if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): day = int(day) # update initial value - fully known day @@ -140,29 +143,31 @@ def calculate_earliest_latest(self, year, month, day): min_day = max_day = day else: # if we have no day or partial day, calculate min / max - min_day = 1 - # if we know year and month (or max month), calculate exactly - if year and month and isinstance(year, int): - _, max_day = monthrange(int(year), max_month) - elif year is None and month: - # If we don't have year and month, - # calculate based on a known non-leap year - # (better than just setting 31, but still not great) - _, 
max_day = monthrange(self.NON_LEAP_YEAR, max_month) - else: - max_day = 31 + min_day = 1 # is min day ever anything other than 1 ? + rel_year = year if year and isinstance(year, int) else None + # use month if it is an integer; otherwise use previously determined + # max month (which may not be 12 if month is partially unknown) + rel_month = month if month and isinstance(month, int) else max_month + + max_day = self.calendar_converter.max_day(rel_year, rel_month) # if day is partially specified, narrow min/max further if day is not None: min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) # TODO: special case, if we get a Feb 29 date with unknown year, - # must switch the min/max years to known leap years! + # should switch the min/max years to known leap years! # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = Date(min_year, min_month, min_day) - self.latest = Date(max_year, max_month, max_day) + # convert to Gregorian calendar so earliest/latest can always + # be used for comparison + self.earliest = Date( + *self.calendar_converter.to_gregorian(min_year, min_month, min_day) + ) + self.latest = Date( + *self.calendar_converter.to_gregorian(max_year, max_month, max_day) + ) def set_calendar(self, calendar: Union[str, Calendar]): if calendar is not None: @@ -432,6 +437,8 @@ def _missing_digit_minmax( # given a possible range, calculate min/max values for a string # with a missing digit + # TODO: test this method directly + # assuming two digit only (i.e., month or day) possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] # ensure input value has two digits diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py index f74d412..7ae3a55 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py +++ 
b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -1,43 +1,77 @@ import pytest from undate.converters.calendars import HijriDateConverter -from undate.undate import Undate, Calendar -from undate.date import DatePrecision +from undate.converters.calendars.hijri.transformer import HijriUndate +from undate.undate import Calendar +from undate.date import DatePrecision, Date class TestHijriDateConverter: - def test_parse_(self): + def test_parse(self): # day # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 date_str = "7 Jumādā I 1243" date = HijriDateConverter().parse(date_str) - assert date == Undate(1243, 5, 7) + assert date == HijriUndate(1243, 5, 7) assert date.calendar == Calendar.HIJRI assert date.precision == DatePrecision.DAY assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" - # TODO: earliest/latest should be converted to Gregorian - # assert date.earliest == Date(1827, 11, 26) - # assert date.latest == Date(1827, 11, 26) # month date_str = "Rajab 495" date = HijriDateConverter().parse(date_str) - assert date == Undate(495, 7) # Rajab is month 7 + assert date == HijriUndate(495, 7) # Rajab is month 7 assert date.calendar == Calendar.HIJRI assert date.precision == DatePrecision.MONTH assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" - # TODO: Gregorian earliest/ latest - # assert date == UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)) + # Gregorian earliest/ latest + assert date.earliest == Date(1102, 4, 28) + assert date.latest == Date(1102, 5, 27) # year date_str = "441" date = HijriDateConverter().parse(date_str) - assert date == Undate(441) + assert date == HijriUndate(441) assert date.calendar == Calendar.HIJRI assert date.precision == DatePrecision.YEAR assert date.label == f"{date_str} {HijriDateConverter.calendar_name}" - # TODO: Gregorian earliest/ latest - # assert date == UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)) + # Gregorian earliest/ 
latest + assert date.earliest == Date(1049, 6, 11) + assert date.latest == Date(1050, 5, 31) + + def test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 + date = HijriUndate(1243, 5, 7) + assert date.earliest == Date(1827, 11, 26) + assert date.latest == Date(1827, 11, 26) + + # Jumādā I 1243 : 1827-11-20 to 1827-12-19 + date = HijriUndate(1243, 5) + assert date.earliest == Date(1827, 11, 20) + assert date.latest == Date(1827, 12, 19) + + # Rajab 495: 1102-04-28 to 1102-05-27 (Rajab = month 7) + date = HijriUndate(495, 7) + assert date.earliest == Date(1102, 4, 28) + assert date.latest == Date(1102, 5, 27) + + # 441 : 1049-06-11 to 1050-05-31 + date = HijriUndate(441) + assert date.earliest == Date(1049, 6, 11) + assert date.latest == Date(1050, 5, 31) + + # examples from ISMI data (reformatted to day month year) + # 14 Rabīʿ I 901 : 1495-12-11 (Rabi 1 = month 3 ) + date = HijriUndate(901, 3, 14) + assert date.earliest == Date(1495, 12, 11) + assert date.latest == Date(1495, 12, 11) + + # 884 : 1479-04-03 to 1480-03-21 + date = HijriUndate(884) + assert date.earliest == Date(1479, 4, 3) + assert date.latest == Date(1480, 3, 21) def test_parse_error(self): # a string we can't parse should raise an error diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py index 0cb5aa0..7ebc117 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_transformer.py @@ -18,20 +18,16 @@ def test_hijri_undate(): # Monday, 7 Jumādā I 1243 Hijrī (26 November, 1827 CE); Jumada I = month 5 ("7 Jumādā I 1243", HijriUndate(1243, 5, 7), DatePrecision.DAY), ("Jumādā I 1243", HijriUndate(1243, 5), DatePrecision.MONTH), - # Gregorian: UndateInterval(Undate(1827, 
11, 20), Undate(1827, 12, 19)) ("1243", HijriUndate(1243), DatePrecision.YEAR), # Gregorian: UndateInterval(Undate(1827, 7, 25), Undate(1828, 7, 13)), # Zū al-Qaʿdah / Dhu l-Qa'da = month 11 ("27 Dhū l-Qaʿda 632", HijriUndate(632, 11, 27), DatePrecision.DAY), # Rajab = month 7 ("Rajab 495", HijriUndate(495, 7), DatePrecision.MONTH), - # Gregorian: UndateInterval(Undate(1102, 4, 28), Undate(1102, 5, 27)), ("441", HijriUndate(441), DatePrecision.YEAR), - # Gregorian: UndateInterval(Undate(1049, 6, 11), Undate(1050, 5, 31)), # examples from ISMI data (reformatted to day month year) # Rabi 1 = month 3 ("14 Rabīʿ I 901", HijriUndate(901, 3, 14), DatePrecision.DAY), - # Gregorian: Undate(1495, 12, 11) ("884", HijriUndate(884), DatePrecision.YEAR), # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), # add when we support parsing ranges: diff --git a/tests/test_undate.py b/tests/test_undate.py index fd4c169..11ea550 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -2,6 +2,8 @@ from datetime import date import pytest + +from undate.converters.base import BaseDateConverter from undate.date import DatePrecision, Timedelta from undate.undate import Undate, UndateInterval, Calendar @@ -563,3 +565,11 @@ def test_duration(self): # one year set and the other not currently raises not implemented error with pytest.raises(NotImplementedError): UndateInterval(Undate(2000), Undate()).duration() + + +def test_calendar_get_converter(): + # ensure we can retrieve a calendar converter for each + # calendar named in our calendar enum + for cal in Calendar: + converter = Calendar.get_converter(cal) + assert isinstance(converter, BaseDateConverter) From fe415452b4f79ca7ba2c781ad4fa8092c39d077b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 16:44:48 -0500 Subject: [PATCH 16/36] Generate iso format date from native calendar date, not earliest/latest --- src/undate/converters/iso8601.py | 28 +++++++++++++++++++++------- 1 file changed, 21 
insertions(+), 7 deletions(-) diff --git a/src/undate/converters/iso8601.py b/src/undate/converters/iso8601.py index a0ecad5..09399eb 100644 --- a/src/undate/converters/iso8601.py +++ b/src/undate/converters/iso8601.py @@ -77,19 +77,33 @@ def _undate_to_string(self, undate: Undate) -> str: # TODO: may want to refactor and take advantage of the year/month/day properties # added for use in EDTF formatter code for date_portion, iso_format in self.iso_format.items(): + # is known means fully known, means guaranteed integer if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year # results in leading zero in some environments # and not others; force year to always be 4 digits - if date_portion == "year": - date_parts.append("%04d" % undate.earliest.year) - elif date_portion == "month" and undate.earliest.month: - date_parts.append("%02d" % undate.earliest.month) - elif date_portion == "day" and undate.earliest.day: - date_parts.append("%02d" % undate.earliest.day) # type: ignore + if date_portion == "year" and undate.year: + try: + date_parts.append("%04d" % int(undate.year)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.year) + elif date_portion == "month" and undate.month: + try: + date_parts.append("%02d" % int(undate.month)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.month) + elif date_portion == "day" and undate.day: + try: + date_parts.append("%02d" % int(undate.day)) + except ValueError: + # shouldn't happen because of is_known + date_parts.append(undate.day) elif date_portion == "year": - # if not known but this is year, add '-' for --MM-DD unknown year format + # if year is not known, add '-' for year portion, + # to generate --MM-DD unknown year format date_parts.append("-") # TODO: fix type error: "list[str | None]" is incompatible with "Iterable[str]" return "-".join(date_parts) # type: ignore From 3a43e6dedfcbda38ac32c98fb65c0d07def9ba05 Mon 
Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 16:45:09 -0500 Subject: [PATCH 17/36] Include calendar name in undate repr --- src/undate/undate.py | 5 ++--- tests/test_undate.py | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index 8500bf8..042fd9a 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -202,9 +202,8 @@ def __str__(self) -> str: return self.converter.to_string(self) def __repr__(self) -> str: - if self.label: - return "" % (self.label, self) - return "" % self + label_str = f" '{self.label}'" if self.label else "" + return f"" @classmethod def parse(cls, date_string, format) -> Union["Undate", "UndateInterval"]: diff --git a/tests/test_undate.py b/tests/test_undate.py index 11ea550..37c9af9 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -27,11 +27,12 @@ def test_partially_known_str(self): # assert str(Undate(2022, day=7)) == "2022-XX-07" @ currently returns 2022-07 def test_repr(self): - assert repr(Undate(2022, 11, 7)) == "" + assert repr(Undate(2022, 11, 7)) == "" assert ( repr(Undate(2022, 11, 7, label="A Special Day")) - == "" + == "" ) + assert repr(Undate(484, calendar=Calendar.HIJRI)) == "" def test_init_str(self): assert Undate("2000").earliest.year == 2000 From 7c9ccb7d745d079f68edb4156a9076837b60bfe1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 17:06:50 -0500 Subject: [PATCH 18/36] Support and test comparing undates across calendars --- src/undate/undate.py | 17 +++++++-- .../test_hijri/test_hijri_converter.py | 36 ++++++++++++++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index 042fd9a..108c56e 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -254,11 +254,15 @@ def __eq__(self, other: object) -> bool: if other is NotImplemented: return NotImplemented + # if both dates are fully known, then earliest/latest check + # is sufficient (and 
will work across calendars!) + # check for apparent equality + # - earliest/latest match and both have the same precision looks_equal = ( self.earliest == other.earliest and self.latest == other.latest - and self.initial_values == other.initial_values + and self.precision == other.precision ) # if everything looks the same, check for any unknowns in initial values # the same unknown date should NOT be considered equal @@ -268,8 +272,15 @@ def __eq__(self, other: object) -> bool: # in one format (i.e. X for missing digits). # If we support other formats, will need to normalize to common # internal format for comparison - if looks_equal and any("X" in str(val) for val in self.initial_values.values()): - return False + if looks_equal: + # if any part of either date that is known is _partially_ known, + # then these dates are not equal + if any( + [self.is_partially_known(p) for p in self.initial_values.keys()] + ) or any( + [other.is_partially_known(p) for p in other.initial_values.keys()] + ): + return False return looks_equal diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py index 7ae3a55..098b0f3 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -2,7 +2,7 @@ from undate.converters.calendars import HijriDateConverter from undate.converters.calendars.hijri.transformer import HijriUndate -from undate.undate import Calendar +from undate.undate import Calendar, Undate from undate.date import DatePrecision, Date @@ -80,3 +80,37 @@ def test_parse_error(self): # empty string should also error with pytest.raises(ValueError): HijriDateConverter().parse("") + + def test_compare_across_calendars(self): + # only day-precision dates can be exactly equal across calendars + + # 7 Jumādā I 1243 Hijrī : 26 November, 1827; Jumada I = month 5 + assert 
HijriUndate(1243, 5, 7) == Undate(1827, 11, 26) + # 14 Rabīʿ I 901 : 1495-12-11 (Rabi 1 = month 3 ) + assert HijriUndate(901, 3, 14) == Undate(1495, 12, 11) + + # greater than / less than + assert HijriUndate(901) < Undate(1500) + assert HijriUndate(901) > Undate(1450) + # Jumādā I 1243 : 1827-11-20 to 1827-12-19 + assert HijriUndate(1243, 5) > Undate(1827, 10) + assert HijriUndate(1243, 5) < Undate(1828, 1) + + # 7 Jumādā I 1243 Hijrī : 26 November, 1827, so it falls + # within (or is contained by) November 1827 + assert HijriUndate(1243, 5, 7) in Undate(1827, 11) + assert HijriUndate(1243, 5, 7) not in Undate(1827, 10) + + # sorting + sorted_dates = sorted( + [ + HijriUndate(884), # 1479 to 1480 Gregorian + HijriUndate(441), # 1049 to 1050 Gregorian + HijriUndate(901), # 1495 to 1495 Gregorian + Undate(1995), + Undate(33), + Undate(1350), + ] + ) + expected_gregorian_years = [33, 1049, 1350, 1479, 1495, 1995] + assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years From b6b6376ad33b14bcbced8f08dfec6b237d3ace9d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 17:19:39 -0500 Subject: [PATCH 19/36] Work around StrEnum not being in python until 3.11 --- pyproject.toml | 2 +- src/undate/undate.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index da206e4..f1ad9a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" license = { text = "Apache-2" } requires-python = ">= 3.9" dynamic = ["version"] -dependencies = ["lark[interegular]", "numpy", "convertdate"] +dependencies = ["lark[interegular]", "numpy", "convertdate", "strenum; python_version < '3.11'"] authors = [ { name = "Rebecca Sutton Koeser" }, { name = "Cole Crawford" }, diff --git a/src/undate/undate.py b/src/undate/undate.py index 108c56e..8a10073 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,6 +1,14 @@ import datetime import re -from enum import StrEnum, 
auto + +from enum import auto + +try: + # StrEnum was only added in python 3.11 + from enum import StrEnum +except ImportError: + # for python 3.10 or earlier, use third-party package + from strenum import StrEnum # type: ignore # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Dict, Optional, Union From e91b7ba00b8d47d0977f02921c5f091d5c94288c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 17:25:57 -0500 Subject: [PATCH 20/36] Allow any Hijri year (drop 3+ digit year constraint and year-month-day) --- src/undate/converters/calendars/hijri/hijri.lark | 10 ++++------ src/undate/converters/calendars/hijri/parser.py | 4 ++-- .../test_calendars/test_hijri/test_hijri_parser.py | 12 ++++++++---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/undate/converters/calendars/hijri/hijri.lark b/src/undate/converters/calendars/hijri/hijri.lark index ae8d73e..4e6ccc7 100644 --- a/src/undate/converters/calendars/hijri/hijri.lark +++ b/src/undate/converters/calendars/hijri/hijri.lark @@ -1,9 +1,9 @@ %import common.WS %ignore WS -hijri_date: year | month year | day month year | year month | year month day -// NOTE: ISMI sample dates are year month day -// if we can assume years are 3 digits minimum, we can support year month day AND we can use faster LALR parser +// only support day month year format for now +// parser requires numeric day and year to be distinguished based on order +hijri_date: day month year | month year | year // TODO: handle date ranges? @@ -11,9 +11,7 @@ hijri_date: year | month year | day month year | year month | year month day // PGP dates use qualifiers like "first decade of" (for beginning of month) // "first third of", seasons (can look for more examples) -// TODO: is there a minimum year we need to support? 
-// if we can assume 3+ digit years we can distinguish between days and years, -year: /\d{3,}/ +year: /\d+/ // months month: month_1 diff --git a/src/undate/converters/calendars/hijri/parser.py b/src/undate/converters/calendars/hijri/parser.py index df13a40..273cdf9 100644 --- a/src/undate/converters/calendars/hijri/parser.py +++ b/src/undate/converters/calendars/hijri/parser.py @@ -5,5 +5,5 @@ grammar_path = pathlib.Path(__file__).parent / "hijri.lark" with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but requires assumption of 3+ digit years - hijri_parser = Lark(grammar.read(), start="hijri_date", strict=True, parser="lalr") + # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates + hijri_parser = Lark(grammar.read(), start="hijri_date", strict=True) diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py index 9d465ef..6b9c828 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_parser.py @@ -39,10 +39,11 @@ "7 Jumādā I 1243", "29 Muḥarram 1243", "30 Muḥarram 1243", - # year month, if we can assume 3+ digit years - "901 Rabīʿ I", - # year month day - "901 Rabīʿ I 12", + "Rabīʿ I 901", + "12 Rabīʿ I 901", + # two and 1 digit years + "12 Rabīʿ I 90", + "12 Rabīʿ I 9", ] @@ -63,6 +64,9 @@ def test_should_parse(date_string): "Foo 383", # wrong format "2024-10-02", + # year month day not supported + "901 Rabīʿ I", + "901 Rabīʿ I 12", ] From 6c6f09a35cb355f0b22fd7bd8b20d4293bcbecb4 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 17:35:44 -0500 Subject: [PATCH 21/36] Confirm hijri dates + partially unknown date behavior --- .../test_hijri/test_hijri_converter.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py 
b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py index 098b0f3..6541586 100644 --- a/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py +++ b/tests/test_converters/test_calendars/test_hijri/test_hijri_converter.py @@ -81,6 +81,44 @@ def test_parse_error(self): with pytest.raises(ValueError): HijriDateConverter().parse("") + def test_partially_known(self): + # hijri dates get existing partially unknown behavior + unknown_month = HijriUndate(1243, "XX") + assert unknown_month.precision == DatePrecision.MONTH + assert unknown_month.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 1, 1) + ) + assert unknown_month.latest == Date( + *HijriDateConverter().to_gregorian(1243, 12, 30) + ) + + partially_unknown_month = HijriUndate(1243, "1X") + assert partially_unknown_month.precision == DatePrecision.MONTH + assert partially_unknown_month.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 10, 1) + ) + assert partially_unknown_month.latest == Date( + *HijriDateConverter().to_gregorian(1243, 12, 30) + ) + + unknown_day = HijriUndate(1243, 2, "XX") + assert unknown_day.precision == DatePrecision.DAY + assert unknown_day.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 1) + ) + # second month has 29 days + assert unknown_day.latest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 29) + ) + partially_unknown_day = HijriUndate(1243, 2, "2X") + assert partially_unknown_day.precision == DatePrecision.DAY + assert partially_unknown_day.earliest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 20) + ) + assert partially_unknown_day.latest == Date( + *HijriDateConverter().to_gregorian(1243, 2, 29) + ) + def test_compare_across_calendars(self): # only day-precision dates can be exactly equal across calendars From 5cc19fdabefc8b32a57864d169ba79c705dcbbd4 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 18:07:25 -0500 Subject: [PATCH 22/36] Add calendar converter base class and 
document how to add calendars --- src/undate/converters/base.py | 66 +++++++++++++++++-- src/undate/converters/calendars/gregorian.py | 14 ++-- .../converters/calendars/hijri/converter.py | 4 +- tests/test_converters/test_base.py | 4 +- tests/test_undate.py | 5 +- 5 files changed, 77 insertions(+), 16 deletions(-) diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 02cf820..ecdbf9b 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -1,10 +1,11 @@ """ -:class:`undate.converters.BaseDateConverter` provides a base class for +:class:`~undate.converters.BaseDateConverter` provides a base class for implementing date converters, which can provide support for -parsing and generating dates in different formats and also converting -dates between different calendars. +parsing and generating dates in different formats. +The converter subclass :class:`undate.converters.BaseCalendarConverter` +provides additional functionality needed for calendar conversion. -To add support for a new date format or calendar conversion: +To add support for a new date converter: - Create a new file under ``undate/converters/`` - For converters with sufficient complexity, you may want to create a submodule; @@ -18,6 +19,25 @@ The new subclass should be loaded automatically and included in the converters returned by :meth:`BaseDateConverter.available_converters` +To add support for a new calendar converter: + +- Create a new file under ``undate/converters/calendars/`` + - For converters with sufficient complexity, you may want to create a submodule; + see ``undate.converters.calendars.hijri`` for an example. +- Extend ``BaseCalendarConverter`` and implement ``parse`` and ``to_string`` + formatter methods as desired/appropriate for your converter as well as the + additional methods for ``max_month``, ``max_day``, and conversion ``to_gregorian`` + calendar.
+- Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` +- Add the new calendar to the ``Calendar`` enum of supported calendars in + ``undate/undate.py`` and confirm that the `get_converter` method loads your + calendar converter correctly (an existing unit test should cover this). +- Consider creating a notebook to demonstrate the use of the calendar + converter. + +Calendar converter subclasses are also automatically loaded and included +in the list of available converters. + ------------------- """ @@ -90,6 +110,42 @@ def available_converters(cls) -> Dict[str, Type["BaseDateConverter"]]: """ Dictionary of available converters keyed on name. """ + return {c.name: c for c in cls.subclasses()} # type: ignore + + @classmethod + def subclasses(cls) -> list[Type["BaseDateConverter"]]: + """ + List of available converters classes. Includes calendar convert + subclasses. + """ # ensure undate converters are imported cls.import_converters() - return {c.name: c for c in cls.__subclasses__()} # type: ignore + + # find all direct subclasses, excluding base calendar converter + subclasses = cls.__subclasses__() + subclasses.remove(BaseCalendarConverter) + # add all subclasses of calendar converter base class + subclasses.extend(BaseCalendarConverter.__subclasses__()) + return subclasses + + +class BaseCalendarConverter(BaseDateConverter): + """Base class for calendar converters, with additional methods required + for calendars.""" + + #: Converter name. Subclasses must define a unique name. 
+ name: str = "Base Calendar Converter" + + def max_month(self, year: int) -> int: + """Maximum month for this calendar for this year""" + raise NotImplementedError + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + raise NotImplementedError + + def to_gregorian(self, year, month, day) -> tuple[int, int, int]: + """Convert a date for this calendar specified by numeric year, month, and day, + into the Gregorian equivalent date. Should return a tuple of year, month, day. + """ + raise NotImplementedError diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index 2db1156..f794329 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -1,12 +1,11 @@ from calendar import monthrange -from typing import Optional -from undate.converters.base import BaseDateConverter +from undate.converters.base import BaseCalendarConverter -class GregorianDateConverter(BaseDateConverter): +class GregorianDateConverter(BaseCalendarConverter): """ - Converter class for Gregorian calendar. + Calendar converter class for Gregorian calendar. """ #: converter name: Gregorian @@ -20,7 +19,8 @@ def max_month(self, year: int) -> int: """Maximum month for this calendar for this year""" return 12 - def max_day(self, year: Optional[int] = None, month: Optional[int] = None) -> int: + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" # if month is known, use that to calculate if month: # if year is known, use it; otherwise use a known non-leap year @@ -38,4 +38,8 @@ def max_day(self, year: Optional[int] = None, month: Optional[int] = None) -> in return max_day def to_gregorian(self, year, month, day) -> tuple[int, int, int]: + """Convert a Gregorian date, specified by year, month, and day, + to the Gregorian equivalent date.
Returns a tuple of year, month, day. + """ + return (year, month, day) diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 5c694f8..9a8ad72 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -3,13 +3,13 @@ from convertdate import islamic # type: ignore from lark.exceptions import UnexpectedCharacters -from undate.converters.base import BaseDateConverter +from undate.converters.base import BaseCalendarConverter from undate.converters.calendars.hijri.parser import hijri_parser from undate.converters.calendars.hijri.transformer import HijriDateTransformer from undate.undate import Undate, UndateInterval -class HijriDateConverter(BaseDateConverter): +class HijriDateConverter(BaseCalendarConverter): """ Converter for Hijri / Islamic calendar. diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py index 60d5d1e..1426f13 100644 --- a/tests/test_converters/test_base.py +++ b/tests/test_converters/test_base.py @@ -18,7 +18,7 @@ def test_available_converters(self): def test_converters_are_unique(self): assert len(BaseDateConverter.available_converters()) == len( - BaseDateConverter.__subclasses__() + BaseDateConverter.subclasses() ), "Formatter names have to be unique." 
def test_parse_not_implemented(self): @@ -60,5 +60,5 @@ class ISO8601DateFormat2(BaseDateConverter): name = "ISO8601" # duplicates existing formatter assert len(BaseDateConverter.available_converters()) != len( - BaseDateConverter.__subclasses__() + BaseDateConverter.subclasses() ) diff --git a/tests/test_undate.py b/tests/test_undate.py index 37c9af9..ecf0777 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -3,7 +3,7 @@ import pytest -from undate.converters.base import BaseDateConverter +from undate.converters.base import BaseCalendarConverter from undate.date import DatePrecision, Timedelta from undate.undate import Undate, UndateInterval, Calendar @@ -573,4 +573,5 @@ def test_calendar_get_converter(): # calendar named in our calendar enum for cal in Calendar: converter = Calendar.get_converter(cal) - assert isinstance(converter, BaseDateConverter) + assert isinstance(converter, BaseCalendarConverter) + assert converter.name.lower() == cal.name.lower() From d26574c94c76d0f57117aaff86decfc742b12021 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 19:21:24 -0500 Subject: [PATCH 23/36] Implementing Hebrew Anno Mundi calendar converter based on Hijri --- src/undate/converters/base.py | 11 +- src/undate/converters/calendars/gregorian.py | 4 - .../converters/calendars/hebrew/__init__.py | 3 + .../converters/calendars/hebrew/converter.py | 71 +++++++++ .../converters/calendars/hebrew/hebrew.lark | 55 +++++++ .../converters/calendars/hebrew/parser.py | 9 ++ .../calendars/hebrew/transformer.py | 40 +++++ .../converters/calendars/hijri/__init__.py | 3 +- .../converters/calendars/hijri/converter.py | 8 +- src/undate/undate.py | 14 +- .../test_hebrew/test_hebrew_converter.py | 142 ++++++++++++++++++ .../test_hebrew/test_hebrew_parser.py | 63 ++++++++ .../test_hebrew/test_hebrew_transformer.py | 43 ++++++ 13 files changed, 447 insertions(+), 19 deletions(-) create mode 100644 src/undate/converters/calendars/hebrew/__init__.py create mode 100644 
src/undate/converters/calendars/hebrew/converter.py create mode 100644 src/undate/converters/calendars/hebrew/hebrew.lark create mode 100644 src/undate/converters/calendars/hebrew/parser.py create mode 100644 src/undate/converters/calendars/hebrew/transformer.py create mode 100644 tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py create mode 100644 tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py create mode 100644 tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index ecdbf9b..14bff87 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -28,6 +28,7 @@ formatter methods as desired/appropriate for your converter as well as the additional methods for ``max_month``, ``max_day``, and conversion ``to_gregorian`` calendar. +- Import your calendar in ``undate/converters/calendars/__init__.py`` and include in ``__all__`` - Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` - Add the new calendar to the ``Calendar`` enum of supported calendars in ``undate/undate.py`` and confirm that the `get_converter` method loads your @@ -136,9 +137,13 @@ class BaseCalendarConverter(BaseDateConverter): #: Converter name. Subclasses must define a unique name. name: str = "Base Calendar Converter" - def max_month(self, year: int) -> int: - """Maximum month for this calendar for this year""" - raise NotImplementedError + def min_month(self) -> int: + """First month for this calendar. Defaults to 1.""" + return 1 + + def max_month(self) -> int: + """Last month for this calendar.
Defaults to 12.""" + return 12 def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index f794329..9a3e2a9 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -15,10 +15,6 @@ class GregorianDateConverter(BaseCalendarConverter): #: known non-leap year NON_LEAP_YEAR: int = 2022 - def max_month(self, year: int) -> int: - """Maximum month for this calendar for this year""" - return 12 - def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" # if month is known, use that to calculate diff --git a/src/undate/converters/calendars/hebrew/__init__.py b/src/undate/converters/calendars/hebrew/__init__.py new file mode 100644 index 0000000..4ac5b4b --- /dev/null +++ b/src/undate/converters/calendars/hebrew/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.hijri.converter import HijriDateConverter + +__all__ = ["HijriDateConverter"] diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py new file mode 100644 index 0000000..7d83dc7 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -0,0 +1,71 @@ +from typing import Union + +from convertdate import hebrew # type: ignore +from lark.exceptions import UnexpectedCharacters + +from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer +from undate.undate import Undate, UndateInterval + + +class HebrewDateConverter(BaseCalendarConverter): + """ + Converter for Hebrew Anno Mundi calendar.
+ + Support for parsing Anno Mundi dates and converting to Undate and UndateInterval + objects in the Gregorian calendar. + """ + + #: converter name: Hebrew + name: str = "Hebrew" + calendar_name: str = "Anno Mundi" + + def __init__(self): + self.transformer = HebrewDateTransformer() + + def min_month(self) -> int: + """first numeric month for the specified year in this calendar""" + # hebrew calendar civil year starts in Tishri + return hebrew.TISHRI + + def max_month(self) -> int: + """last numeric month for the specified year in this calendar""" + # hebrew calendar civil year starts in Tishri + # Elul is the month before Tishri + return hebrew.ELUL + + def max_day(self, year: int, month: int) -> int: + """maximum numeric day for the specified year and month in this calendar""" + # NOTE: unreleased v2.4.1 of convertdate standardizes month_days to month_length + return hebrew.month_days(year, month) + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Hebrew date, specified by year, month, and day, + to the Gregorian equivalent date. Returns a tuple of year, month, day. + """ + return hebrew.to_gregorian(year, month, day) + + def parse(self, value: str) -> Union[Undate, UndateInterval]: + """ + Parse a Hebrew date string and return an :class:`~undate.undate.Undate` or + :class:`~undate.undate.UndateInterval`. + The Hebrew date string is preserved in the undate label. 
+ """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Hebrew date parser + parsetree = hebrew_parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the original date as a label, with the calendar name + undate_obj.label = f"{value} {self.calendar_name}" + return undate_obj + except UnexpectedCharacters as err: + raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err + + # do we need to support conversion the other direction? + # i.e., generate a Hebrew date from an abitrary undate or undate interval? diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark new file mode 100644 index 0000000..64e527b --- /dev/null +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -0,0 +1,55 @@ +%import common.WS +%ignore WS + +// only support day month year format for now +// parser requires numeric day and year to be distinguished based on order +hebrew_date: day month year | month year | year + +// TODO: handle date ranges? + +// TODO: add support for qualifiers? 
+// PGP dates use qualifiers like "first decade of" (for beginning of month) +// "first third of", seasons (can look for more examples) + +year: /\d+/ + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + | month_13 +// months have 29 or 30 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /30/ + +// months, in order; from convertdate list +// with variants from Princeton Geniza Project +// support matching with and without accents +month_1: "Nisan" +// Iyar or Iyyar +month_2: /Iyy?ar/ +month_3: "Sivan" +month_4: "Tammuz" +month_5: "Av" +month_6: "Elul" +// Tishrei or Tishri +month_7: /Tishre?i/ +month_8: "Heshvan" +month_9: "Kislev" +// Tevet or Teveth +month_10: /[ṬT]eveth?/ +month_11: "Shevat" +// Adar I or Adar +month_12: /Adar( I)?/ +// Adar II or Adar Bet +month_13: /Adar (II|Bet)/ + + diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py new file mode 100644 index 0000000..5654f60 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -0,0 +1,9 @@ +import pathlib + +from lark import Lark + +grammar_path = pathlib.Path(__file__).parent / "hebrew.lark" + +with open(grammar_path) as grammar: + # NOTE: LALR parser is faster but can't be used due to ambiguity between years and days + hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True) diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py new file mode 100644 index 0000000..a6d2888 --- /dev/null +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -0,0 +1,40 @@ +from lark import Transformer, Tree + +from undate.undate import Undate, Calendar + + +class HebrewUndate(Undate): + """Undate convenience subclass; sets default calendar to Hebrew.""" + + calendar = Calendar.HEBREW + + +class HebrewDateTransformer(Transformer): +
"""Transform a Hebrew date parse tree and return an Undate or + UndateInterval.""" + + def hebrew_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with islamic year, month, day and + # islamic calendar + return HebrewUndate(**parts) + + # year translation is not needed since we want a tree with name year + # this is equivalent to a no-op + # def year(self, items): + # return Tree(data="year", children=[items[0]]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) diff --git a/src/undate/converters/calendars/hijri/__init__.py b/src/undate/converters/calendars/hijri/__init__.py index 4ac5b4b..8c28d52 100644 --- a/src/undate/converters/calendars/hijri/__init__.py +++ b/src/undate/converters/calendars/hijri/__init__.py @@ -1,3 +1,4 @@ from undate.converters.calendars.hijri.converter import HijriDateConverter +from undate.converters.calendars.hebrew.converter import HebrewDateConverter -__all__ = ["HijriDateConverter"] +__all__ = ["HijriDateConverter", "HebrewDateConverter"] diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 9a8ad72..910c67e 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -24,10 +24,6 @@ class HijriDateConverter(BaseCalendarConverter): def __init__(self): self.transformer = HijriDateTransformer() - def max_month(self, year: int) -> int: - """maximum numeric month for the specified year in this calendar""" 
- return 12 - def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" return islamic.month_length(year, month) @@ -41,8 +37,8 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: def parse(self, value: str) -> Union[Undate, UndateInterval]: """ Parse a Hijri date string and return an :class:`~undate.undate.Undate` or - :class:`~undate.undate.UndateInterval` in Gregorian calendar. - The Hijri date string is preserved in the undate label + :class:`~undate.undate.UndateInterval`. + The Hijri date string is preserved in the undate label. """ if not value: raise ValueError("Parsing empty string is not supported") diff --git a/src/undate/undate.py b/src/undate/undate.py index 8a10073..0c635c0 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -22,6 +22,7 @@ class Calendar(StrEnum): GREGORIAN = auto() HIJRI = auto() + HEBREW = auto() @staticmethod def get_converter(calendar): @@ -123,10 +124,11 @@ def calculate_earliest_latest(self, year, month, day): if month == "XX": month = None - min_month = 1 # is min month ever anything other than 1 ? - # get max month from the calendar, since it depends on the - # calendar and potentially the year (e.g. leap years in Hebrew Anno Mundi) - max_month = self.calendar_converter.max_month(max_year) + # get first and last month from the calendar, since it is not + # always 1 and 12 + # TODO need to differentiate between min/max and first/last! 
+ min_month = self.calendar_converter.min_month() + max_month = self.calendar_converter.max_month() if month is not None: try: # treat as an integer if we can @@ -137,7 +139,9 @@ def calculate_earliest_latest(self, year, month, day): except ValueError: # if not, calculate min/max for missing digits min_month, max_month = self._missing_digit_minmax( - str(month), min_month, max_month + str(month), + 1, + 12, # min_month, max_month ) # similar to month above — unknown day, but day-level granularity if day == "XX": diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py new file mode 100644 index 0000000..f335975 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -0,0 +1,142 @@ +import pytest + +from undate.converters.calendars.hebrew.converter import HebrewDateConverter +from undate.converters.calendars.hebrew.transformer import HebrewUndate +from undate.undate import Calendar, Undate +from undate.date import DatePrecision, Date + + +class TestHebrewDateConverter: + def test_parse(self): + # day + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + date_str = "26 Tammuz 4816" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4816, 4, 26) + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.DAY + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # month + date_str = "Ṭevet 5362" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(5362, 10) # Teveth = month 10 + assert date.calendar == Calendar.HEBREW + assert date.precision == DatePrecision.MONTH + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + # year + date_str = "4932" + date = HebrewDateConverter().parse(date_str) + assert date == HebrewUndate(4932) + assert date.calendar == Calendar.HEBREW + assert date.precision == 
DatePrecision.YEAR + assert date.label == f"{date_str} {HebrewDateConverter.calendar_name}" + + def test_gregorian_earliest_latest(self): + # earliest/latest should be converted to Gregorian for comparison + + # full date + + # 26 Tammuz 4816: 17 July, 1056; Tammuz = month 4 + date = HebrewUndate(4816, 4, 26) + assert date.earliest == Date(1056, 7, 17) + assert date.latest == Date(1056, 7, 17) + # 13 Tishrei 5416 Anno Mundi (1655-10-14) + date = HebrewUndate(5416, 7, 13) # Tishrei = month 7 + assert date.earliest == Date(1655, 10, 14) + assert date.latest == Date(1655, 10, 14) + + # month + + # Ṭevet 5362 Anno Mundi (25 December, 1601 – 22 January, 1602) + date = HebrewUndate(5362, 10) + assert date.earliest == Date(1601, 12, 25) + assert date.latest == Date(1602, 1, 22) + + # year + # 5416 : October 1655 to September 1656 + date = HebrewUndate(5416) + assert date.earliest == Date(1655, 10, 2) + assert date.latest == Date(1656, 9, 18) + + def test_parse_error(self): + # a string we can't parse should raise an error + with pytest.raises(ValueError): + HebrewDateConverter().parse("January 2, 1991") + # empty string should also error + with pytest.raises(ValueError): + HebrewDateConverter().parse("") + + def test_partially_known(self): + # hebrew dates get existing partially unknown behavior + + converter = HebrewDateConverter() + + # hebrew first/last month are not the same as min/max + unknown_month = HebrewUndate(1243, "XX") + assert unknown_month.precision == DatePrecision.MONTH + assert unknown_month.earliest == Date( + *converter.to_gregorian(1243, converter.min_month(), 1) + ) + max_month = converter.max_month() + assert unknown_month.latest == Date( + *converter.to_gregorian(1243, max_month, converter.max_day(1243, max_month)) + ) + + partially_unknown_month = HebrewUndate(1243, "1X") + assert partially_unknown_month.precision == DatePrecision.MONTH + assert partially_unknown_month.earliest == Date( + *converter.to_gregorian(1243, 10, 1) + ) + assert 
partially_unknown_month.latest == Date( + *converter.to_gregorian(1243, 12, 30) + ) + + # second month has 29 days + unknown_day = HebrewUndate(1243, 2, "XX") + assert unknown_day.precision == DatePrecision.DAY + assert unknown_day.earliest == Date(*converter.to_gregorian(1243, 2, 1)) + assert unknown_day.latest == Date(*converter.to_gregorian(1243, 2, 29)) + + partially_unknown_day = HebrewUndate(1243, 2, "2X") + assert partially_unknown_day.precision == DatePrecision.DAY + assert partially_unknown_day.earliest == Date( + *converter.to_gregorian(1243, 2, 20) + ) + assert partially_unknown_day.latest == Date( + *converter.to_gregorian(1243, 2, 29) + ) + + def test_compare_across_calendars(self): + # only day-precision dates can be exactly equal across calendars + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056 Gregorian) + assert HebrewUndate(4816, 4, 26) == Undate(1056, 7, 17) + # 13 Tishrei 5416; Tishrei = month 7 (1655-10-14) + assert HebrewUndate(5416, 7, 13) == Undate(1655, 10, 14) + + # greater than / less than + assert HebrewUndate(4816) < Undate(1060) + assert HebrewUndate(5416) < Undate(1660) + assert HebrewUndate(5416, 7) > Undate(1655, 1) + assert HebrewUndate(4816, 4, 26) > Undate(1055, 5) + + # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056) + # so it falls within or is contained by July 1056 + assert HebrewUndate(4816, 4, 26) in Undate(1056, 7) + assert HebrewUndate(4816, 4, 26) not in Undate(1054) + + # sorting + sorted_dates = sorted( + [ + HebrewUndate(4816, 4, 26), # 1056-07-17 + HebrewUndate(5416), # 1655 + HebrewUndate(500), # -3261 + Undate(1995), + Undate(33), + Undate(1350), + ] + ) + expected_gregorian_years = [-3261, 33, 1056, 1350, 1655, 1995] + assert [d.earliest.year for d in sorted_dates] == expected_gregorian_years diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py new file mode 100644 index 0000000..e4894b1 --- /dev/null
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py @@ -0,0 +1,63 @@ +import pytest +from undate.converters.calendars.hebrew.parser import hebrew_parser + + +# for now, just test that valid dates can be parsed + +testcases = [ + # year + "5362", + # month + year + # - with and without accent + "Ṭevet 5362", + "Tevet 5362", + "Elul 4932", + "Sivan 5581", + # variant month name, with or without accent + "Ṭeveth 5362", + "Teveth 5362", + "Iyyar 1526", + "Iyar 1526", + # day month year + "26 Tammuz 4816", + "7 Heshvan 5425", + "26 Tishrei 5416", + "26 Tishri 5416", + "14 Adar 5403", + "14 Adar I 5403", + "9 Adar II 5404", + "9 Adar Bet 5404", + # two and 1 digit years + "536", + "53", + "3", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert hebrew_parser.parse(date_string) + + +error_cases = [ + # invalid days + "0 Tammuz 5403", + "31 Tishri 5403", + # month alone + "Tishri", + # month day only + "12 Heshvan", + # invalid month + "Foo 383", + # wrong format + "2024-10-02", + # year month day not supported + "5403 Adar", + "5403 Adar 14", +] + + +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + hebrew_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py new file mode 100644 index 0000000..6e4a5e6 --- /dev/null +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -0,0 +1,43 @@ +import pytest +from undate.converters.calendars.hebrew.parser import hebrew_parser +from undate.converters.calendars.hebrew.transformer import ( + HebrewDateTransformer, + HebrewUndate, +) +from undate.undate import Undate, Calendar +from undate.date import DatePrecision + + +def test_hebrew_undate(): + assert HebrewUndate(848).calendar == Calendar.HEBREW + + +testcases = [ + 
# examples from Princeton Geniza Project + # date conversions checked with https://www.muqawwim.com/ + # 26 Tammuz 4816; Tammuz = month 4 + ("26 Tammuz 4816", HebrewUndate(4816, 4, 26), DatePrecision.DAY), + ("Tammuz 4816", HebrewUndate(4816, 4), DatePrecision.MONTH), + ("4816", HebrewUndate(4816), DatePrecision.YEAR), + # 26 Tishrei 5416: Tishrei = month 7 + ("26 Tishrei 5416", HebrewUndate(5416, 7, 26), DatePrecision.DAY), + # Ṭeveth = month 10 + ("Ṭevet 5362", HebrewUndate(5362, 10), DatePrecision.MONTH), + ("5362", HebrewUndate(5362), DatePrecision.YEAR), + # add when we support parsing ranges: + # Adar I and Adar II 5453 : (1693 CE) +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = HebrewDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = hebrew_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + assert transformed_date == expected + # currently only undates have date precision + if isinstance(transformed_date, Undate): + assert transformed_date.precision == expected_precision + # transformer doesn't have access to date string, + # label will need to be set by the converter class From 5660fa2fd8c0f43e8779d46229f3b1d6fb713ef1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 19:29:03 -0500 Subject: [PATCH 24/36] Fix mis-formatted docstring --- src/undate/converters/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index ecdbf9b..630c9f5 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -25,9 +25,9 @@ - For converters with sufficient complexity, you may want to create a submodule; see ``undate.converters.calendars.hijri`` for an example. 
- Extend ``BaseCalendarConverter`` and implement ``parse`` and ``to_string`` - formatter methods as desired/appropriate for your converter as well as the - additional methods for ``max_month``, ``max_day``, and convertion ``to_gregorian`` - calendar. + formatter methods as desired/appropriate for your converter as well as the + additional methods for ``max_month``, ``max_day``, and convertion ``to_gregorian`` + calendar. - Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` - Add the new calendar to the ``Calendar`` enum of supported calendars in ``undate/undate.py`` and confirm that the `get_converter` method loads your From c6ed8179e98ff3c58e61cbfd6fee649f7456b4b2 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 19:29:03 -0500 Subject: [PATCH 25/36] Fix mis-formatted docstring --- src/undate/converters/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 14bff87..150fc5f 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -25,9 +25,9 @@ - For converters with sufficient complexity, you may want to create a submodule; see ``undate.converters.calendars.hijri`` for an example. - Extend ``BaseCalendarConverter`` and implement ``parse`` and ``to_string`` - formatter methods as desired/appropriate for your converter as well as the - additional methods for ``max_month``, ``max_day``, and convertion ``to_gregorian`` - calendar. + formatter methods as desired/appropriate for your converter as well as the + additional methods for ``max_month``, ``max_day``, and convertion ``to_gregorian`` + calendar. 
- Import your calendar in ``undate/converters/calendars/__init__.py`` and include in `__all__`` - Add unit tests for the new calendar logic under ``tests/test_converters/calendars/`` - Add the new calendar to the ``Calendar`` enum of supported calendars in From 88e4d1741d16a054775fa061e5c2d593f9be070c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 26 Nov 2024 21:43:59 -0500 Subject: [PATCH 26/36] Adjust imports for hebrew calendar converter --- src/undate/converters/calendars/__init__.py | 3 ++- src/undate/converters/calendars/hebrew/__init__.py | 4 ++-- .../test_calendars/test_hebrew/test_hebrew_converter.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py index 635af21..c14e115 100644 --- a/src/undate/converters/calendars/__init__.py +++ b/src/undate/converters/calendars/__init__.py @@ -1,4 +1,5 @@ from undate.converters.calendars.gregorian import GregorianDateConverter from undate.converters.calendars.hijri import HijriDateConverter +from undate.converters.calendars.hebrew import HebrewDateConverter -__all__ = ["HijriDateConverter", "GregorianDateConverter"] +__all__ = ["HijriDateConverter", "GregorianDateConverter", "HebrewDateConverter"] diff --git a/src/undate/converters/calendars/hebrew/__init__.py b/src/undate/converters/calendars/hebrew/__init__.py index 4ac5b4b..e612ce3 100644 --- a/src/undate/converters/calendars/hebrew/__init__.py +++ b/src/undate/converters/calendars/hebrew/__init__.py @@ -1,3 +1,3 @@ -from undate.converters.calendars.hijri.converter import HijriDateConverter +from undate.converters.calendars.hebrew.converter import HebrewDateConverter -__all__ = ["HijriDateConverter"] +__all__ = ["HebrewDateConverter"] diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py index f335975..1c05632 100644 --- 
a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -1,6 +1,6 @@ import pytest -from undate.converters.calendars.hebrew.converter import HebrewDateConverter +from undate.converters.calendars import HebrewDateConverter from undate.converters.calendars.hebrew.transformer import HebrewUndate from undate.undate import Calendar, Undate from undate.date import DatePrecision, Date From f908cd5465916c9ed6a1f1fd338ad94b4b1c35f3 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 Dec 2024 10:51:55 +0100 Subject: [PATCH 27/36] Add comment about earliest Hebrew year in grammar --- src/undate/converters/calendars/hebrew/hebrew.lark | 1 + 1 file changed, 1 insertion(+) diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark index 64e527b..b55ec3f 100644 --- a/src/undate/converters/calendars/hebrew/hebrew.lark +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -11,6 +11,7 @@ hebrew_date: day month year | month year | year // PGP dates use qualifiers like "first decade of" (for beginning of month) // "first third of", seasons (can look for more examples) +// Hebrew calendar starts with year 1 in 3761 BCE year: /\d+/ // months From c24cd34a6bfb6603c8639c299b8077b288678cb5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 Dec 2024 10:52:40 +0100 Subject: [PATCH 28/36] Test exceptions and parser type errors more specific --- .../test_hebrew/test_hebrew_converter.py | 7 ++++++ .../test_hebrew/test_hebrew_parser.py | 24 ++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py index 1c05632..319b551 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py +++ 
b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -68,6 +68,13 @@ def test_parse_error(self): with pytest.raises(ValueError): HebrewDateConverter().parse("") + # non-string input should raise a type error + with pytest.raises(TypeError): + HebrewDateConverter().parse(42) + + with pytest.raises(TypeError): + HebrewDateConverter().parse({"foo": "bar"}) + def test_partially_known(self): # hebrew dates get existing partially unknown behavior diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py index e4894b1..69b929e 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py @@ -1,4 +1,6 @@ import pytest +from lark.exceptions import UnexpectedCharacters, UnexpectedEOF + from undate.converters.calendars.hebrew.parser import hebrew_parser @@ -41,23 +43,23 @@ def test_should_parse(date_string): error_cases = [ # invalid days - "0 Tammuz 5403", - "31 Tishri 5403", + ("0 Tammuz 5403", UnexpectedCharacters), + ("31 Tishri 5403", UnexpectedCharacters), # month alone - "Tishri", + ("Tishri", UnexpectedEOF), # month day only - "12 Heshvan", + ("12 Heshvan", UnexpectedEOF), # invalid month - "Foo 383", + ("Foo 383", UnexpectedCharacters), # wrong format - "2024-10-02", + ("2024-10-02", UnexpectedCharacters), # year month day not supported - "5403 Adar", - "5403 Adar 14", + ("5403 Adar", UnexpectedCharacters), + ("5403 Adar 14", UnexpectedCharacters), ] -@pytest.mark.parametrize("date_string", error_cases) -def test_should_error(date_string): - with pytest.raises(Exception): +@pytest.mark.parametrize("date_string,exception", error_cases) +def test_should_error(date_string, exception): + with pytest.raises(exception): hebrew_parser.parse(date_string) From 5773bf7d0848481adcc522b8f6ed2e2938fc606c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 
Dec 2024 10:53:45 +0100 Subject: [PATCH 29/36] Run unit tests on pull request to any branch --- .github/workflows/unit_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 89df8cb..17a1c7a 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,6 +8,8 @@ on: - 'undate/**' - 'tests/**' pull_request: + branches: + - "**" env: # python version used to calculate and submit code coverage From 3032785dbf16de32ffcfc6ff51a54eb33bf7c406 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 Dec 2024 10:57:26 +0100 Subject: [PATCH 30/36] Fix incorrect import --- src/undate/converters/calendars/hijri/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/undate/converters/calendars/hijri/__init__.py b/src/undate/converters/calendars/hijri/__init__.py index 8c28d52..4ac5b4b 100644 --- a/src/undate/converters/calendars/hijri/__init__.py +++ b/src/undate/converters/calendars/hijri/__init__.py @@ -1,4 +1,3 @@ from undate.converters.calendars.hijri.converter import HijriDateConverter -from undate.converters.calendars.hebrew.converter import HebrewDateConverter -__all__ = ["HijriDateConverter", "HebrewDateConverter"] +__all__ = ["HijriDateConverter"] From 91376088979e277e55f82f5e554dc3b7b86c13c3 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 Dec 2024 10:57:43 +0100 Subject: [PATCH 31/36] Force calendar converters to implement min/max month methods --- src/undate/converters/base.py | 8 ++++---- src/undate/converters/calendars/gregorian.py | 8 ++++++++ src/undate/converters/calendars/hijri/converter.py | 8 ++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 150fc5f..bcd90c2 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -138,12 +138,12 @@ class BaseCalendarConverter(BaseDateConverter): name: str = "Base Calendar 
Converter" def min_month(self) -> int: - """First month for this calendar. Defaults to 1.""" - return 1 + """First month for this calendar.""" + raise NotImplementedError def max_month(self) -> int: - """Last month for this calendar. Defaults to 12.""" - return 12 + """Last month for this calendar.""" + raise NotImplementedError def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index 9a3e2a9..af8ea25 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -15,6 +15,14 @@ class GregorianDateConverter(BaseCalendarConverter): #: known non-leap year NON_LEAP_YEAR: int = 2022 + def min_month(self) -> int: + """First month for the Gregorian calendar.""" + return 1 + + def max_month(self) -> int: + """maximum numeric month for the specified year in the Gregorian calendar""" + return 12 + def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" # if month is known, use that to calculate diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 910c67e..1cb7c82 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -28,6 +28,14 @@ def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" return islamic.month_length(year, month) + def min_month(self) -> int: + """First month for this calendar.""" + return 1 + + def max_month(self) -> int: + """maximum numeric month for the specified year in this calendar""" + return 12 + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: """Convert a Hijri date, specified by year, month, and day, to the Gregorian 
equivalent date. Returns a tuple of year, month, day. From 920f7361fa21818ff57a217b5f9edbb144ca5f62 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 6 Dec 2024 11:34:03 +0100 Subject: [PATCH 32/36] Differentiate min/max month from first/last month --- src/undate/converters/base.py | 14 ++++++++--- src/undate/converters/calendars/gregorian.py | 2 +- .../converters/calendars/hebrew/converter.py | 19 ++++++++++----- .../converters/calendars/hijri/converter.py | 6 ++--- src/undate/undate.py | 24 +++++++++---------- .../test_hebrew/test_hebrew_converter.py | 14 +++++++---- 6 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index bcd90c2..5fefe49 100644 --- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -138,13 +138,21 @@ class BaseCalendarConverter(BaseDateConverter): name: str = "Base Calendar Converter" def min_month(self) -> int: - """First month for this calendar.""" + """Smallest numeric month for this calendar.""" raise NotImplementedError - def max_month(self) -> int: - """Last month for this calendar.""" + def max_month(self, year: int) -> int: + """Maximum numeric month for this calendar""" raise NotImplementedError + def first_month(self) -> int: + """first month in this calendar; by default, returns :meth:`min_month`.""" + return self.min_month() + + def last_month(self, year: int) -> int: + """last month in this calendar; by default, returns :meth:`max_month`.""" + return self.max_month(year) + def max_day(self, year: int, month: int) -> int: """maximum numeric day for the specified year and month in this calendar""" raise NotImplementedError diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index af8ea25..59cde48 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -19,7 +19,7 @@ def min_month(self) -> int: """First month for the Gregorian 
calendar.""" return 1 - def max_month(self) -> int: + def max_month(self, year: int) -> int: """maximum numeric month for the specified year in the Gregorian calendar""" return 12 diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py index 7d83dc7..b8b4620 100644 --- a/src/undate/converters/calendars/hebrew/converter.py +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -25,14 +25,21 @@ def __init__(self): self.transformer = HebrewDateTransformer() def min_month(self) -> int: - """first numeric month for the specified year in this calendar""" - # hebrew calendar civil year starts in Tishri + """Smallest numeric month for this calendar.""" + return 1 + + def max_month(self, year: int) -> int: + """Maximum numeric month for this calendar. In Hebrew calendar, this is 12 or 13 + depending on whether it is a leap year.""" + return hebrew.year_months(year) + + def first_month(self) -> int: + """First month in this calendar. The Hebrew civil year starts in Tishri.""" return hebrew.TISHRI - def max_month(self) -> int: - """last numeric month for the specified year in this calendar""" - # hebrew calendar civil year starts in Tishri - # Elul is the month before Tishri + def last_month(self, year: int) -> int: + """Last month in this calendar. 
Hebrew civil year starts in Tishri, + Elul is the month before Tishri.""" return hebrew.ELUL def max_day(self, year: int, month: int) -> int: diff --git a/src/undate/converters/calendars/hijri/converter.py b/src/undate/converters/calendars/hijri/converter.py index 1cb7c82..b4b81b1 100644 --- a/src/undate/converters/calendars/hijri/converter.py +++ b/src/undate/converters/calendars/hijri/converter.py @@ -29,11 +29,11 @@ def max_day(self, year: int, month: int) -> int: return islamic.month_length(year, month) def min_month(self) -> int: - """First month for this calendar.""" + """smallest numeric month for this calendar.""" return 1 - def max_month(self) -> int: - """maximum numeric month for the specified year in this calendar""" + def max_month(self, year: int) -> int: + """maximum numeric month for this calendar""" return 12 def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: diff --git a/src/undate/undate.py b/src/undate/undate.py index 0c635c0..fab277c 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -124,24 +124,24 @@ def calculate_earliest_latest(self, year, month, day): if month == "XX": month = None - # get first and last month from the calendar, since it is not - # always 1 and 12 - # TODO need to differentiate between min/max and first/last! 
+ # get first and last month from the calendar (not always 1 and 12) + # as well as min/max months + earliest_month = self.calendar_converter.first_month() + latest_month = self.calendar_converter.last_month(max_year) + min_month = self.calendar_converter.min_month() - max_month = self.calendar_converter.max_month() + max_month = self.calendar_converter.max_month(max_year) if month is not None: try: # treat as an integer if we can month = int(month) # update initial value self.initial_values["month"] = month - min_month = max_month = month + earliest_month = latest_month = month except ValueError: # if not, calculate min/max for missing digits - min_month, max_month = self._missing_digit_minmax( - str(month), - 1, - 12, # min_month, max_month + earliest_month, latest_month = self._missing_digit_minmax( + str(month), min_month, max_month ) # similar to month above — unknown day, but day-level granularity if day == "XX": @@ -159,7 +159,7 @@ def calculate_earliest_latest(self, year, month, day): rel_year = year if year and isinstance(year, int) else None # use month if it is an integer; otherwise use previusly determined # max month (which may not be 12 depending if partially unknown) - rel_month = month if month and isinstance(month, int) else max_month + rel_month = month if month and isinstance(month, int) else latest_month max_day = self.calendar_converter.max_day(rel_year, rel_month) @@ -175,10 +175,10 @@ def calculate_earliest_latest(self, year, month, day): # convert to Gregorian calendar so earliest/latest can always # be used for comparison self.earliest = Date( - *self.calendar_converter.to_gregorian(min_year, min_month, min_day) + *self.calendar_converter.to_gregorian(min_year, earliest_month, min_day) ) self.latest = Date( - *self.calendar_converter.to_gregorian(max_year, max_month, max_day) + *self.calendar_converter.to_gregorian(max_year, latest_month, max_day) ) def set_calendar(self, calendar: Union[str, Calendar]): diff --git 
a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py index 319b551..c3c8b7c 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -84,11 +84,13 @@ def test_partially_known(self): unknown_month = HebrewUndate(1243, "XX") assert unknown_month.precision == DatePrecision.MONTH assert unknown_month.earliest == Date( - *converter.to_gregorian(1243, converter.min_month(), 1) + *converter.to_gregorian(1243, converter.first_month(), 1) ) - max_month = converter.max_month() + last_month = converter.last_month(year=1243) assert unknown_month.latest == Date( - *converter.to_gregorian(1243, max_month, converter.max_day(1243, max_month)) + *converter.to_gregorian( + 1243, last_month, converter.max_day(1243, last_month) + ) ) partially_unknown_month = HebrewUndate(1243, "1X") @@ -96,8 +98,12 @@ def test_partially_known(self): assert partially_unknown_month.earliest == Date( *converter.to_gregorian(1243, 10, 1) ) + # for unknown digit, assume largest possible value instead + # of last semantic monthin the year + last_month = converter.max_month(year=1243) + last_day = converter.max_day(1243, last_month) assert partially_unknown_month.latest == Date( - *converter.to_gregorian(1243, 12, 30) + *converter.to_gregorian(1243, last_month, last_day) ) # second month has 29 days From b7ae594016d2e73390745cef6e394f5d89837053 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Sat, 7 Dec 2024 12:08:33 +0100 Subject: [PATCH 33/36] Rewrite gregorian calendar docstring that incorrectly ref'ed Hijri --- src/undate/converters/calendars/gregorian.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index 59cde48..63a3dd9 100644 --- a/src/undate/converters/calendars/gregorian.py 
+++ b/src/undate/converters/calendars/gregorian.py @@ -42,8 +42,9 @@ def max_day(self, year: int, month: int) -> int: return max_day def to_gregorian(self, year, month, day) -> tuple[int, int, int]: - """Convert a Hijri date, specified by year, month, and day, - to the Gregorian equivalent date. Returns a tuple of year, month, day. + """Convert to Gregorian date. This returns the specified by year, month, + and day unchanged, but is provided for consistency since all calendar + converters need to support conversion to Gregorian calendar for + a common point of comparison. """ - return (year, month, day) From 759d0c7b4b471d004abd08d0f3ed76674bbfc2db Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Sat, 7 Dec 2024 12:21:19 +0100 Subject: [PATCH 34/36] Fix docstring typo caught by @coderabbitai --- src/undate/converters/calendars/gregorian.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian.py index 63a3dd9..5a1d2dc 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian.py @@ -5,11 +5,12 @@ class GregorianDateConverter(BaseCalendarConverter): """ - Calendar onverter class for Gregorian calendar. + Calendar converter class for Gregorian calendar. 
""" #: converter name: Gregorian name: str = "Gregorian" + #: calendar calendar_name: str = "Gregorian" #: known non-leap year From d9fd4ba39cf6dc7f6c6032121f8f606d122c18a9 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 20 Dec 2024 15:30:14 -0500 Subject: [PATCH 35/36] Include calendar converters in sphinx docs and add basic usage to readme --- README.md | 29 ++++++++++++++++++++++++++++- docs/undate/converters.rst | 35 +++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 37b8452..9c8e898 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ An `UndateInterval` is a date range between two `Undate` objects. Intervals can ``` You can initialize `Undate` or `UndateInterval` objects by parsing a date string with a specific converter, and you can also output an `Undate` object in those formats. -Available converters are "ISO8601" and "EDTF" (but only) +Currently available converters are "ISO8601" and "EDTF" and supported calendars. ```python >>> from undate import Undate @@ -156,6 +156,33 @@ Available converters are "ISO8601" and "EDTF" (but only) ``` +### Calendars + +All `Undate` objects are calendar aware, and date converters include support for parsing and working with dates from other calendars. The Gregorian calendar is used by default; currently `undate` supports the Hijri Islamic calendar and the Anno Mundi Hebrew calendar based on calendar convertion logic implemented in the [convertdate](https://convertdate.readthedocs.io/en/latest/)package. + +Dates are stored with the year, month, day and appropriate precision for the original calendar; internally, earliest and latest dates are calculated in Gregorian / Proleptic Gregorian calendar for standardized comparison across dates from different calendars. 
+ +```python +>>> from undate import Undate +>>> tammuz4816 = Undate.parse("26 Tammuz 4816", "Hebrew") +>>> tammuz4816 + +>>> rajab495 = Undate.parse("Rajab 495", "Hijri") +>>> rajab495 + +>>> y2k = Undate.parse("2001", "EDTF") +>>> y2k + +>>> [str(d.earliest) for d in [rajab495, tammuz4816, y2k]] +['1102-04-28', '1056-07-17', '2001-01-01'] +>>> [str(d.precision) for d in [rajab495, tammuz4816, y2k]] +['MONTH', 'DAY', 'YEAR'] +>>> sorted([rajab495, tammuz4816, y2k]) +[, , ] +``` + +* * * + For more examples, refer to the [example notebooks](https://github.com/dh-tech/undate-python/tree/main/examples/notebooks/) included in this repository. ## Documentation diff --git a/docs/undate/converters.rst b/docs/undate/converters.rst index 701aaf1..57e90a1 100644 --- a/docs/undate/converters.rst +++ b/docs/undate/converters.rst @@ -1,19 +1,25 @@ Converters ========== +Overview +-------- + .. automodule:: undate.converters.base :members: :undoc-members: +Formats +-------- + ISO8601 -------- +^^^^^^^ .. automodule:: undate.converters.iso8601 :members: :undoc-members: Extended Date-Time Format (EDTF) --------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. automodule:: undate.converters.edtf.converter :members: @@ -23,8 +29,25 @@ Extended Date-Time Format (EDTF) :members: :undoc-members: -.. transformer is more of an internal, probably doesn't make sense to include -.. .. automodule:: undate.converters.edtf.transformer -.. :members: -.. :undoc-members: + +Calendars +--------- + +Gregorian +^^^^^^^^^ + +.. automodule:: undate.converters.calendars.gregorian + :members: + +Hijri (Islamic calendar) +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: undate.converters.calendars.hijri.converter + :members: + +Anno Mundi (Hebrew calendar) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
automodule:: undate.converters.calendars.hebrew.converter + :members: From 4372b237e78d0491eaf1328608817f62fd83aaeb Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 20 Dec 2024 15:40:58 -0500 Subject: [PATCH 36/36] Address coverage issues flagged by codecov --- src/undate/converters/edtf/transformer.py | 11 +++-------- tests/test_converters/test_base.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/undate/converters/edtf/transformer.py b/src/undate/converters/edtf/transformer.py index 135c93b..d5bcfcb 100644 --- a/src/undate/converters/edtf/transformer.py +++ b/src/undate/converters/edtf/transformer.py @@ -54,24 +54,19 @@ def year_unspecified(self, items): return Tree(data="year", children=[value]) def month_unspecified(self, items): + # combine multiple parts into a single string value = "".join(self.get_values(items)) return Tree(data="month", children=[value]) def day_unspecified(self, items): + # combine multiple parts into a single string value = "".join(self.get_values(items)) return Tree(data="day", children=[value]) def date_level1(self, items): return self.date(items) - def year(self, items): - # when the year is negative, there are two tokens - if len(items) > 1 and items[0] == "-": - # an anonymous token for the - and the integer year - year = items[1] - return Tree(data="year", children=[-year]) - - return Tree(data="year", children=[items[0]]) + # year (including negative years) use default transformation def year_fivedigitsplus(self, items): # strip off the leading Y and convert to integer diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py index 1426f13..c9578e4 100644 --- a/tests/test_converters/test_base.py +++ b/tests/test_converters/test_base.py @@ -1,7 +1,7 @@ import logging import pytest -from undate.converters.base import BaseDateConverter +from undate.converters.base import BaseDateConverter, BaseCalendarConverter class TestBaseDateConverter: @@ -62,3 +62,15 @@ class 
ISO8601DateFormat2(BaseDateConverter): assert len(BaseDateConverter.available_converters()) != len( BaseDateConverter.subclasses() ) + + +class TestBaseCalendarConverter: + def test_not_implemented(self): + with pytest.raises(NotImplementedError): + BaseCalendarConverter().min_month() + with pytest.raises(NotImplementedError): + BaseCalendarConverter().max_month(1900) + with pytest.raises(NotImplementedError): + BaseCalendarConverter().max_day(1900, 12) + with pytest.raises(NotImplementedError): + BaseCalendarConverter().to_gregorian(1900, 12, 31)