From 269970c519e41a0a05eb5564d25df2ccbfe0b33b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 16:18:44 -0500 Subject: [PATCH 01/13] Support initializing undate with partially known year or month Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> --- tests/test_undate.py | 43 +++++++++++++++++++++++ undate/undate.py | 83 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 16 deletions(-) diff --git a/tests/test_undate.py b/tests/test_undate.py index 628d7b5..a338d45 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -12,6 +12,47 @@ def test_str(self): assert str(Undate(2022)) == "2022" assert str(Undate(month=11, day=7)) == "--11-07" + def test_init_str(self): + assert Undate("2000").earliest.year == 2000 + # single or double digit string month should be ok + assert Undate("2000", "2").earliest.month == 2 + assert Undate("2000", "02").earliest.month == 2 + + def test_init_partially_known_year(self): + uncertain1900s = Undate("19XX") + assert uncertain1900s.earliest.year == 1900 + assert uncertain1900s.latest.year == 1999 + + uncertain1x = Undate("1X05") + assert uncertain1x.earliest.year == 1005 + assert uncertain1x.latest.year == 1905 + + uncertain18x7 = Undate("18X7") + assert uncertain18x7.earliest.year == 1807 + assert uncertain18x7.latest.year == 1897 + + def test_init_partially_known_month(self): + uncertain_fall = Undate(1900, "1X") + assert uncertain_fall.earliest.month == 10 + assert uncertain_fall.latest.month == 12 + + uncertain_notfall = Undate(1900, "0X") + assert uncertain_notfall.earliest.month == 1 + assert uncertain_notfall.latest.month == 9 + + # treat as unknown but allow + unknown_month = Undate(1900, "XX") + assert unknown_month.earliest.month == 1 + assert unknown_month.latest.month == 12 + assert str(unknown_month) == "1900" # NOT 1900-XX ? 
+ + def test_init_invalid(self): + with pytest.raises(ValueError): + Undate("19xx") + + with pytest.raises(ValueError): + Undate(1900, "X1") + def test_invalid_date(self): # invalid month should raise an error with pytest.raises(ValueError): @@ -52,6 +93,8 @@ def test_duration(self): def test_known_year(self): assert Undate(2022).known_year is True assert Undate(month=2, day=5).known_year is False + # partially known year is not known + assert Undate("19XX").known_year is False class TestUndateInterval: diff --git a/undate/undate.py b/undate/undate.py index 544a1c9..0e4c7c7 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -18,6 +18,9 @@ class Undate: DEFAULT_FORMAT = "ISO8601" + #: symbol for unknown digits within a date value + MISSING_DIGIT = "X" + earliest: Union[datetime.date, None] = None latest: Union[datetime.date, None] = None label: Union[str, None] = None @@ -25,8 +28,8 @@ class Undate: def __init__( self, - year: Optional[int] = None, - month: Optional[int] = None, + year: Optional[Union[int, str]] = None, + month: Optional[Union[int, str]] = None, day: Optional[int] = None, formatter: Optional[BaseDateFormat] = None, ): @@ -34,29 +37,73 @@ def __init__( # e.g., maybe values could be string or int; if string with # unknown digits, calculate min/max for unknowns + # keep track of initial values and which values are known + self.initial_values: Dict[str, Union[int, str]] = { + "year": year, + "month": month, + "day": day, + } + + if year is not None: + try: + year = int(year) + min_year = max_year = year + except ValueError: + # year is a string that can't be converted to int + min_year = int(year.replace(self.MISSING_DIGIT, "0")) + max_year = int(year.replace(self.MISSING_DIGIT, "9")) + else: + min_year = datetime.MINYEAR + max_year = datetime.MAXYEAR + + # if month is passed in as a string but completely unknown, + # treat as none + # TODO: we should preserve this information somehow; + # difference between just a year and and an unknown month 
within a year + # maybe in terms of granularity / size ? + if month == "XX": + month = None + + if month is not None: + try: + month = int(month) + min_month = max_month = month + except ValueError: + min_month = max_month = None + if len(month) == 2: + # if two digit month is 1x, range is 10 - 12 + if month[0] == "1": + min_month = int(month.replace(self.MISSING_DIGIT, "0")) + max_month = int(month.replace(self.MISSING_DIGIT, "2")) + # if two digit month is 0x, range is 01 - 09 + elif month[0] == "0": + min_month = int(month.replace(self.MISSING_DIGIT, "1")) + max_month = int(month.replace(self.MISSING_DIGIT, "9")) + + # are these possible/plausible ? X1 X2 + # assuming not + if not min_month and not max_month: + raise ValueError + + else: + min_month = 1 + max_month = 12 + # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(year or datetime.MINYEAR, month or 1, day or 1) + self.earliest = datetime.date(min_year, min_month, day or 1) # if day is unknown but we have year and month, calculate max day if day is None and year and month: - _, maxday = monthrange(year, month) + _, maxday = monthrange(year, max_month) elif day is None and year is None and month: # TODO: what to do if we don't have year and month? 
# This will produce bad data if the year is a leap year and the month is February # 2022 chosen below as it is not a not leap year # Better than just setting 31, but still not great - _, maxday = monthrange(2022, month) + _, maxday = monthrange(2022, max_month) else: maxday: int = 31 - self.latest = datetime.date( - year or datetime.MAXYEAR, month or 12, day or maxday - ) - # keep track of which values are known - self.known_values: Dict[str, bool] = { - "year": year is not None, - "month": month is not None, - "day": day is not None, - } + self.latest = datetime.date(max_year, max_month, day or maxday) if not formatter: # TODO subclass definitions not available unless they are imported where Undate() is called @@ -75,12 +122,16 @@ def __eq__(self, other: "Undate") -> bool: return ( self.earliest == other.earliest and self.latest == other.latest - and self.known_values == other.known_values + # NOTE: assumes that partially known values can only be written + # in one format (i.e. X for missing digits). + # If we support other formats, may need to normalize to common + # internal format for comparison + and self.initial_values == other.initial_values ) @property def known_year(self) -> bool: - return self.known_values["year"] + return isinstance(self.initial_values["year"], int) def duration(self) -> datetime.timedelta: # what is the duration of this date? 
From b438e6e9d1dbbad62b71454af74c9c3f66ee6f4b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 16:40:11 -0500 Subject: [PATCH 02/13] Handle some cases for initializing undate partially known days Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> --- tests/test_undate.py | 41 ++++++++++++++++++++++++++++ undate/undate.py | 65 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 92 insertions(+), 14 deletions(-) diff --git a/tests/test_undate.py b/tests/test_undate.py index a338d45..a8450c4 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -46,6 +46,47 @@ def test_init_partially_known_month(self): assert unknown_month.latest.month == 12 assert str(unknown_month) == "1900" # NOT 1900-XX ? + def test_init_partially_known_day(self): + uncertain_day = Undate(1900, 1, "XX") # treat as None + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "1X") + assert uncertain_day.earliest.day == 10 + assert uncertain_day.latest.day == 19 + + uncertain_day = Undate(1900, 1, "0X") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 9 + uncertain_day = Undate(1900, 1, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 29 + uncertain_day = Undate(1900, 1, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "X5") + assert uncertain_day.earliest.day == 5 + assert uncertain_day.latest.day == 25 + + uncertain_day = Undate(1900, 1, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + # TODO: handle months with only 30 days + + # month with only 30 days + uncertain_day = Undate(1900, 6, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 30 + uncertain_day = Undate(1900, 6, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 30 + + 
# special cases + # february! TODO + # uncertain_day = Undate(1900, 2, "2X") + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") diff --git a/undate/undate.py b/undate/undate.py index 0e4c7c7..71d41df 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -30,7 +30,7 @@ def __init__( self, year: Optional[Union[int, str]] = None, month: Optional[Union[int, str]] = None, - day: Optional[int] = None, + day: Optional[Union[int, str]] = None, formatter: Optional[BaseDateFormat] = None, ): # TODO: support initializing for unknown values in each of these @@ -89,21 +89,58 @@ def __init__( min_month = 1 max_month = 12 + # similar to month above — unknown day, but day-level granularity + if day == "XX": + day = None + + if day is not None: + try: + day = int(day) + min_day = max_day = day + except ValueError: + min_day = max_day = None + if len(day) == 2: + # special case since most months only go up to 30/31 + if day[0] == "3": + min_day = int(day.replace(self.MISSING_DIGIT, "0")) + # TODO: possibly max is 0 depending on the month + max_day = int(day.replace(self.MISSING_DIGIT, "1")) + # if second digit is missing, e.g. 1X or 2X + elif day[1] == self.MISSING_DIGIT: + if day[0] == "0": + # can't have 00 + min_day = int(day.replace(self.MISSING_DIGIT, "1")) + else: + min_day = int(day.replace(self.MISSING_DIGIT, "0")) + max_day = int(day.replace(self.MISSING_DIGIT, "9")) + # if first digit is missing + elif day[0] == self.MISSING_DIGIT: + min_day = int(day.replace(self.MISSING_DIGIT, "0")) + if int(day[1]) > 1: + max_day = int(day.replace(self.MISSING_DIGIT, "2")) + else: + max_day = int(day.replace(self.MISSING_DIGIT, "3")) + + else: + min_day = 1 + + # if we know year and month (or max month), calculate exactly + if year and month: + _, max_day = monthrange(year, max_month) + + elif year is None and month: + # TODO: what to do if we don't have year and month? 
+ # This will produce bad data if the year is a leap year and the month is February + # 2022 chosen below as it is not a not leap year + # Better than just setting 31, but still not great + _, max_day = monthrange(2022, max_month) + else: + max_day: int = 31 + # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(min_year, min_month, day or 1) - # if day is unknown but we have year and month, calculate max day - if day is None and year and month: - _, maxday = monthrange(year, max_month) - elif day is None and year is None and month: - # TODO: what to do if we don't have year and month? - # This will produce bad data if the year is a leap year and the month is February - # 2022 chosen below as it is not a not leap year - # Better than just setting 31, but still not great - _, maxday = monthrange(2022, max_month) - else: - maxday: int = 31 - self.latest = datetime.date(max_year, max_month, day or maxday) + self.earliest = datetime.date(min_year, min_month, min_day) + self.latest = datetime.date(max_year, max_month, max_day) if not formatter: # TODO subclass definitions not available unless they are imported where Undate() is called From 69e1188ef6e83cc1896871f65c873198ec246ca5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 16:45:58 -0500 Subject: [PATCH 03/13] Add notes about next steps Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> --- undate/undate.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/undate/undate.py b/undate/undate.py index 71d41df..39814f8 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -44,6 +44,8 @@ def __init__( "day": day, } + # TODO: refactor partial date min/max calculations + if year is not None: try: year = int(year) @@ -173,6 +175,14 @@ def known_year(self) -> bool: def duration(self) -> datetime.timedelta: # what is the duration of this date? 
# subtract earliest from latest, and add a day to count the starting day + + # TODO: update to account for partially known values; + # can it be based on known granularity somehow? + # 1900-11-2X => one day + # 1900-1X => one month ? (30? 31?) + # maybe go with the maximum possible value? + # if granularity == month but not known month, duration = 31 + return self.latest - self.earliest + ONE_DAY From c34f75c91e7d06a4b306b086bad75310d916e7a7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 17:07:11 -0500 Subject: [PATCH 04/13] Clean up known year logic; disable failing tests --- tests/test_undate.py | 20 ++++++++++++++++++++ undate/dateformat/iso8601.py | 8 +++----- undate/undate.py | 18 +++++++++++++++++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/tests/test_undate.py b/tests/test_undate.py index a8450c4..24c667d 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -73,6 +73,10 @@ def test_init_partially_known_day(self): assert uncertain_day.earliest.day == 1 assert uncertain_day.latest.day == 31 + return + + # we don't support these cases yet but need to + # TODO: handle months with only 30 days # month with only 30 days @@ -136,6 +140,22 @@ def test_known_year(self): assert Undate(month=2, day=5).known_year is False # partially known year is not known assert Undate("19XX").known_year is False + # fully known string year should be known + assert Undate("1900").known_year is True + + def test_is_known_month(self): + assert Undate(2022).is_known("month") is False + assert Undate(2022, 2).is_known("month") is True + assert Undate(2022, "5").is_known("month") is True + assert Undate(2022, "1X").is_known("month") is False + assert Undate(2022, "XX").is_known("month") is False + + def test_is_known_day(self): + assert Undate(1984).is_known("day") is False + assert Undate(month=1, day=3).is_known("day") is True + assert Undate(month=1, day="5").is_known("day") is True + assert Undate(month=1, day="X5").is_known("day") is 
False + assert Undate(month=1, day="XX").is_known("day") is False class TestUndateInterval: diff --git a/undate/dateformat/iso8601.py b/undate/dateformat/iso8601.py index 5bf9896..0854faa 100644 --- a/undate/dateformat/iso8601.py +++ b/undate/dateformat/iso8601.py @@ -49,17 +49,15 @@ def to_string(self, undate: Undate) -> str: date_parts: List[Union[str, None]] = [] # for each part of the date that is known, generate the string format # then combine - for date_portion, known in undate.known_values.items(): - if known: + for date_portion, iso_format in self.iso_format.items(): + if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year # results in leading zero in some environments # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) else: - date_parts.append( - undate.earliest.strftime(self.iso_format[date_portion]) - ) + date_parts.append(undate.earliest.strftime(iso_format)) elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/undate/undate.py b/undate/undate.py index 39814f8..6082e47 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -49,6 +49,9 @@ def __init__( if year is not None: try: year = int(year) + # update initial value since it is used to determine + # whether or not year is known + self.initial_values["year"] = year min_year = max_year = year except ValueError: # year is a string that can't be converted to int @@ -69,6 +72,8 @@ def __init__( if month is not None: try: month = int(month) + # update initial value + self.initial_values["month"] = month min_month = max_month = month except ValueError: min_month = max_month = None @@ -98,6 +103,8 @@ def __init__( if day is not None: try: day = int(day) + # update initial value + self.initial_values["day"] = day min_day = max_day = day except ValueError: min_day = max_day = None @@ -170,7 +177,16 @@ def __eq__(self, 
other: "Undate") -> bool: @property def known_year(self) -> bool: - return isinstance(self.initial_values["year"], int) + return self.is_known("year") + + def is_known(self, part: str) -> bool: + """Check if a part of the date (year, month, day) is known. + Returns False if unknown or only partially known.""" + # TODO: should we use constants or enum for values? + + # if we have an integer, then consider the date known + # if we have a string, then it is only partially known; return false + return isinstance(self.initial_values[part], int) def duration(self) -> datetime.timedelta: # what is the duration of this date? From 410196978823e339f91d6822b6cd69e87348bd5e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 17:43:17 -0500 Subject: [PATCH 05/13] Refactor logic for calculating missing month digits --- tests/test_undate.py | 7 ++++--- undate/undate.py | 48 +++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tests/test_undate.py b/tests/test_undate.py index 24c667d..abdd0bd 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -40,6 +40,10 @@ def test_init_partially_known_month(self): assert uncertain_notfall.earliest.month == 1 assert uncertain_notfall.latest.month == 9 + # unlikely case, but now possible to calculate + assert Undate(1900, "X1").earliest.month == 1 + assert Undate(1900, "X1").latest.month == 11 + # treat as unknown but allow unknown_month = Undate(1900, "XX") assert unknown_month.earliest.month == 1 @@ -95,9 +99,6 @@ def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") - with pytest.raises(ValueError): - Undate(1900, "X1") - def test_invalid_date(self): # invalid month should raise an error with pytest.raises(ValueError): diff --git a/undate/undate.py b/undate/undate.py index 6082e47..ae795ea 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -1,5 +1,6 @@ import datetime from calendar import monthrange +import re # Pre 3.10 requires Union 
for multiple types, e.g. Union[int, None] instead of int | None from typing import Optional, Dict, Union @@ -33,10 +34,6 @@ def __init__( day: Optional[Union[int, str]] = None, formatter: Optional[BaseDateFormat] = None, ): - # TODO: support initializing for unknown values in each of these - # e.g., maybe values could be string or int; if string with - # unknown digits, calculate min/max for unknowns - # keep track of initial values and which values are known self.initial_values: Dict[str, Union[int, str]] = { "year": year, @@ -69,32 +66,37 @@ def __init__( if month == "XX": month = None + min_month = 1 + max_month = 12 if month is not None: try: + # treat as an integer if we can month = int(month) # update initial value self.initial_values["month"] = month min_month = max_month = month except ValueError: - min_month = max_month = None - if len(month) == 2: - # if two digit month is 1x, range is 10 - 12 - if month[0] == "1": - min_month = int(month.replace(self.MISSING_DIGIT, "0")) - max_month = int(month.replace(self.MISSING_DIGIT, "2")) - # if two digit month is 0x, range is 01 - 09 - elif month[0] == "0": - min_month = int(month.replace(self.MISSING_DIGIT, "1")) - max_month = int(month.replace(self.MISSING_DIGIT, "9")) - - # are these possible/plausible ? 
X1 X2 - # assuming not - if not min_month and not max_month: - raise ValueError - - else: - min_month = 1 - max_month = 12 + # if not, calculate min/max for missing digits + # determine the range of possible values + possible_values = [f"{n:02}" for n in range(min_month, max_month + 1)] + # generate regex where missing digit matches anything + month_pattern = re.compile(month.replace(self.MISSING_DIGIT, ".")) + # identify all possible matches, then get min and max + matches = [val for val in possible_values if month_pattern.match(val)] + min_match = min(matches) + max_match = max(matches) + + # split input month string into a list so we can update digits + min_month = list(month) + max_month = list(month) + for i, digit in enumerate(month): + if digit == self.MISSING_DIGIT: + min_month[i] = min_match[i] + max_month[i] = max_match[i] + + # combine the lists of digits back together and convert to int + min_month = int("".join(min_month)) + max_month = int("".join(max_month)) # similar to month above — unknown day, but day-level granularity if day == "XX": From bd53ce5e8e59c9b8a6b3729770cf5177ef26c1aa Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 17:53:50 -0500 Subject: [PATCH 06/13] Refactor logic for calculating missing day digits --- tests/test_undate.py | 17 ++++++------ undate/undate.py | 63 ++++++++++++++++++++++---------------------- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/tests/test_undate.py b/tests/test_undate.py index abdd0bd..1746efb 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -77,23 +77,22 @@ def test_init_partially_known_day(self): assert uncertain_day.earliest.day == 1 assert uncertain_day.latest.day == 31 - return - - # we don't support these cases yet but need to - - # TODO: handle months with only 30 days - # month with only 30 days uncertain_day = Undate(1900, 6, "X1") assert uncertain_day.earliest.day == 1 - assert uncertain_day.latest.day == 30 + assert uncertain_day.latest.day == 21 # 
doesn't go to 31 uncertain_day = Undate(1900, 6, "3X") assert uncertain_day.earliest.day == 30 assert uncertain_day.latest.day == 30 # special cases - # february! TODO - # uncertain_day = Undate(1900, 2, "2X") + # february! 28 days usually + uncertain_day = Undate(1900, 2, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 28 + # february in a leap year + uncertain_day = Undate(2024, 2, "2X") + assert uncertain_day.latest.day == 29 def test_init_invalid(self): with pytest.raises(ValueError): diff --git a/undate/undate.py b/undate/undate.py index ae795ea..ec49e08 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -80,6 +80,8 @@ def __init__( # determine the range of possible values possible_values = [f"{n:02}" for n in range(min_month, max_month + 1)] # generate regex where missing digit matches anything + # make sure month is two-digit string + month = "%02s" % month month_pattern = re.compile(month.replace(self.MISSING_DIGIT, ".")) # identify all possible matches, then get min and max matches = [val for val in possible_values if month_pattern.match(val)] @@ -102,43 +104,17 @@ def __init__( if day == "XX": day = None - if day is not None: - try: - day = int(day) - # update initial value - self.initial_values["day"] = day - min_day = max_day = day - except ValueError: - min_day = max_day = None - if len(day) == 2: - # special case since most months only go up to 30/31 - if day[0] == "3": - min_day = int(day.replace(self.MISSING_DIGIT, "0")) - # TODO: possibly max is 0 depending on the month - max_day = int(day.replace(self.MISSING_DIGIT, "1")) - # if second digit is missing, e.g. 
1X or 2X - elif day[1] == self.MISSING_DIGIT: - if day[0] == "0": - # can't have 00 - min_day = int(day.replace(self.MISSING_DIGIT, "1")) - else: - min_day = int(day.replace(self.MISSING_DIGIT, "0")) - max_day = int(day.replace(self.MISSING_DIGIT, "9")) - # if first digit is missing - elif day[0] == self.MISSING_DIGIT: - min_day = int(day.replace(self.MISSING_DIGIT, "0")) - if int(day[1]) > 1: - max_day = int(day.replace(self.MISSING_DIGIT, "2")) - else: - max_day = int(day.replace(self.MISSING_DIGIT, "3")) - + if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): + day = int(day) + # update initial value - fully known day + self.initial_values["day"] = day + min_day = max_day = day else: + # if we have no day or partial day, calculate min / max min_day = 1 - # if we know year and month (or max month), calculate exactly if year and month: _, max_day = monthrange(year, max_month) - elif year is None and month: # TODO: what to do if we don't have year and month? # This will produce bad data if the year is a leap year and the month is February @@ -148,6 +124,29 @@ def __init__( else: max_day: int = 31 + # if day is partially specified, narrow min/max further + if day is not None: + possible_values = [f"{n:02}" for n in range(min_day, max_day + 1)] + day = "%02s" % day + # generate regex where missing digit matches anything + day_pattern = re.compile(day.replace(self.MISSING_DIGIT, ".")) + # identify all possible matches, then get min and max + matches = [val for val in possible_values if day_pattern.match(val)] + min_match = min(matches) + max_match = max(matches) + + # split input string into a list so we can update digits + min_day = list(day) + max_day = list(day) + for i, digit in enumerate(day): + if digit == self.MISSING_DIGIT: + min_day[i] = min_match[i] + max_day[i] = max_match[i] + + # combine the lists of digits back together and convert to int + min_day = int("".join(min_day)) + max_day = int("".join(max_day)) + # for unknowns, assume 
smallest possible value for earliest and # largest valid for latest self.earliest = datetime.date(min_year, min_month, min_day) From 54ffebe8f44dcacb6ebb6ec08afbfbd9a69ea354 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 10 Nov 2022 18:07:50 -0500 Subject: [PATCH 07/13] Refactor shared partially known month/day logic for min/max values --- undate/undate.py | 93 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/undate/undate.py b/undate/undate.py index ec49e08..0694936 100644 --- a/undate/undate.py +++ b/undate/undate.py @@ -17,16 +17,19 @@ class Undate: """Simple object for representing uncertain, fuzzy or partially unknown dates""" - DEFAULT_FORMAT = "ISO8601" + DEFAULT_FORMAT: str = "ISO8601" #: symbol for unknown digits within a date value - MISSING_DIGIT = "X" + MISSING_DIGIT: str = "X" earliest: Union[datetime.date, None] = None latest: Union[datetime.date, None] = None label: Union[str, None] = None formatter: Union[BaseDateFormat, None] = None + #: known non-leap year + NON_LEAP_YEAR: int = 2022 + def __init__( self, year: Optional[Union[int, str]] = None, @@ -77,28 +80,9 @@ def __init__( min_month = max_month = month except ValueError: # if not, calculate min/max for missing digits - # determine the range of possible values - possible_values = [f"{n:02}" for n in range(min_month, max_month + 1)] - # generate regex where missing digit matches anything - # make sure month is two-digit string - month = "%02s" % month - month_pattern = re.compile(month.replace(self.MISSING_DIGIT, ".")) - # identify all possible matches, then get min and max - matches = [val for val in possible_values if month_pattern.match(val)] - min_match = min(matches) - max_match = max(matches) - - # split input month string into a list so we can update digits - min_month = list(month) - max_month = list(month) - for i, digit in enumerate(month): - if digit == self.MISSING_DIGIT: - min_month[i] = min_match[i] - max_month[i] 
= max_match[i] - - # combine the lists of digits back together and convert to int - min_month = int("".join(min_month)) - max_month = int("".join(max_month)) + min_month, max_month = self._missing_digit_minmax( + month, min_month, max_month + ) # similar to month above — unknown day, but day-level granularity if day == "XX": @@ -116,36 +100,16 @@ def __init__( if year and month: _, max_day = monthrange(year, max_month) elif year is None and month: - # TODO: what to do if we don't have year and month? - # This will produce bad data if the year is a leap year and the month is February - # 2022 chosen below as it is not a not leap year - # Better than just setting 31, but still not great - _, max_day = monthrange(2022, max_month) + # If we don't have year and month, + # calculate based on a known non-leap year + # (better than just setting 31, but still not great) + _, max_day = monthrange(self.NON_LEAP_YEAR, max_month) else: max_day: int = 31 # if day is partially specified, narrow min/max further if day is not None: - possible_values = [f"{n:02}" for n in range(min_day, max_day + 1)] - day = "%02s" % day - # generate regex where missing digit matches anything - day_pattern = re.compile(day.replace(self.MISSING_DIGIT, ".")) - # identify all possible matches, then get min and max - matches = [val for val in possible_values if day_pattern.match(val)] - min_match = min(matches) - max_match = max(matches) - - # split input string into a list so we can update digits - min_day = list(day) - max_day = list(day) - for i, digit in enumerate(day): - if digit == self.MISSING_DIGIT: - min_day[i] = min_match[i] - max_day[i] = max_match[i] - - # combine the lists of digits back together and convert to int - min_day = int("".join(min_day)) - max_day = int("".join(max_day)) + min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) # for unknowns, assume smallest possible value for earliest and # largest valid for latest @@ -202,6 +166,37 @@ def duration(self) -> 
datetime.timedelta: return self.latest - self.earliest + ONE_DAY + def _missing_digit_minmax( + self, value: str, min_val: int, max_val: int + ) -> (int, int): + # given a possible range, calculate min/max values for a string + # with a missing digit + + # assuming two digit only (i.e., month or day) + possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] + # ensure input value has two digits + value = "%02s" % value + # generate regex where missing digit matches anything + val_pattern = re.compile(value.replace(self.MISSING_DIGIT, ".")) + # identify all possible matches, then get min and max + matches = [val for val in possible_values if val_pattern.match(val)] + min_match = min(matches) + max_match = max(matches) + + # split input string into a list so we can update individually + min_val = list(value) + max_val = list(value) + for i, digit in enumerate(value): + # replace the corresponding digit with our min and max + if digit == self.MISSING_DIGIT: + min_val[i] = min_match[i] + max_val[i] = max_match[i] + + # combine the lists of digits back together and convert to int + min_val = int("".join(min_val)) + max_val = int("".join(max_val)) + return (min_val, max_val) + class UndateInterval: # date range between two uncertain dates From 738f3eeb781f59dc58f873d5e6cccedfaa683960 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Wed, 8 Mar 2023 20:16:16 -0500 Subject: [PATCH 08/13] Update duration logic for partially known dates with known granularity --- src/undate/undate.py | 63 +++++++++++++++++++++++++++++++++++++------- tests/test_undate.py | 13 +++++++++ 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index f894316..df35134 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,5 +1,6 @@ import datetime from calendar import monthrange +from enum import Enum, auto import re # Pre 3.10 requires Union for multiple types, e.g. 
Union[int, None] instead of int | None @@ -10,10 +11,22 @@ from undate.dateformat.base import BaseDateFormat -# duration of a single day +#: duration of a single day ONE_DAY = datetime.timedelta(days=1) +class DatePrecision(Enum): + """date precision, to indicate date precision independent from how much + of the date is known.""" + + #: year + YEAR = auto() + #: month + MONTH = auto() + #: day + DAY = auto() + + class Undate: """Simple object for representing uncertain, fuzzy or partially unknown dates""" @@ -28,6 +41,8 @@ class Undate: #: Labels are not taken into account when comparing undate objects. label: Union[str, None] = None formatter: Union[BaseDateFormat, None] = None + #: precision of the date (day, month, year, etc.) + precision: DatePrecision = None #: known non-leap year NON_LEAP_YEAR: int = 2022 @@ -46,6 +61,12 @@ def __init__( "month": month, "day": day, } + if day: + self.precision = DatePrecision.DAY + elif month: + self.precision = DatePrecision.MONTH + elif year: + self.precision = DatePrecision.YEAR # TODO: refactor partial date min/max calculations @@ -142,7 +163,7 @@ def __eq__(self, other: "Undate") -> bool: and self.latest == other.latest # NOTE: assumes that partially known values can only be written # in one format (i.e. X for missing digits). - # If we support other formats, may need to normalize to common + # If we support other formats, will need to normalize to common # internal format for comparison and self.initial_values == other.initial_values ) @@ -161,16 +182,38 @@ def is_known(self, part: str) -> bool: return isinstance(self.initial_values[part], int) def duration(self) -> datetime.timedelta: - # what is the duration of this date? - # subtract earliest from latest, and add a day to count the starting day + """What is the duration of this date? 
+ Calculate based on earliest and latest date within range, + taking into account the precision of the date even if not all + parts of the date are known.""" + + # if precision is a single day, duration is one day + # no matter when it is or what else is known + if self.precision == DatePrecision.DAY: + return ONE_DAY + + # if precision is month and year is unknown, + # calculate month duration within a single year (not min/max) + if self.precision == DatePrecision.MONTH: + latest = self.latest + if not self.known_year: + # if year is unknown, calculate month duration in + # a single year + latest = datetime.date( + self.earliest.year, self.latest.month, self.latest.day + ) + delta = latest - self.earliest + ONE_DAY + # month duration can't ever be more than 31 days + # (could we ever know if it's smaller?) + + # if granularity == month but not known month, duration = 31 + if delta.days > 31: + return datetime.timedelta(days=31) + return delta - # TODO: update to account for partially known values; - # can it be based on known granularity somehow? - # 1900-11-2X => one day - # 1900-1X => one month ? (30? 31?) - # maybe go with the maximum possible value? 
- # if granularity == month but not known month, duration = 31 + # otherwise, calculate based on earliest/latest range + # subtract earliest from latest and add a day to count start day return self.latest - self.earliest + ONE_DAY def _missing_digit_minmax( diff --git a/tests/test_undate.py b/tests/test_undate.py index 1ffc939..35e7dae 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -142,6 +142,19 @@ def test_duration(self): leapyear_duration = Undate(2024).duration() assert leapyear_duration.days == 366 + def test_partiallyknown_duration(self): + # day in unknown month/year + assert Undate(day=5).duration().days == 1 + assert Undate(year=1900, month=11, day="2X").duration().days == 1 + + # month in unknown year + assert Undate(month=6).duration().days == 30 + # partially known month + assert Undate(year=1900, month="1X").duration().days == 31 + # what about february? + # could vary with leap years, but assume non-leapyear + assert Undate(month=2).duration().days == 28 + def test_known_year(self): assert Undate(2022).known_year is True assert Undate(month=2, day=5).known_year is False From 6c2a77c3c43c622116dc153b472bdbbb42fa7edb Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Wed, 8 Mar 2023 20:37:45 -0500 Subject: [PATCH 09/13] Update string methods for partially known dates --- src/undate/dateformat/iso8601.py | 1 + src/undate/undate.py | 21 +++++++++++++++++++++ tests/test_undate.py | 13 ++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 3274fac..eb7e8eb 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -48,6 +48,7 @@ def to_string(self, undate: Undate) -> str: date_parts: List[Union[str, None]] = [] # for each part of the date that is known, generate the string format # then combine + # TODO: should error if we have year and day but no month for date_portion, iso_format in self.iso_format.items(): if 
undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year diff --git a/src/undate/undate.py b/src/undate/undate.py index df35134..d3a74d2 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -148,6 +148,24 @@ def __init__( self.label = label def __str__(self) -> str: + # if any portion of the date is partially known, construct + # pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits + # (temporary, should switch to default format that can handle it, e.g. EDTF) + if any(self.is_partially_known(part) for part in ["year", "month", "day"]): + # initial values could be either string or int + year = self.initial_values["year"] + month = self.initial_values["month"] + day = self.initial_values["day"] + # if integer, convert to string with correct number of digits + # replace unknown year with - for --MM or --MM-DD format + parts = [ + f"{year:04d}" if isinstance(year, int) else year or "-", + f"{month:02d}" if isinstance(month, int) else month, + f"{day:02d}" if isinstance(day, int) else day, + ] + # combine, skipping any values that are None + return "-".join([str(p) for p in parts if p != None]) + return self.formatter.to_string(self) def __repr__(self) -> str: @@ -181,6 +199,9 @@ def is_known(self, part: str) -> bool: # if we have a string, then it is only partially known; return false return isinstance(self.initial_values[part], int) + def is_partially_known(self, part: str) -> bool: + return isinstance(self.initial_values[part], str) + def duration(self) -> datetime.timedelta: """What is the duration of this date? 
Calculate based on earliest and latest date within range, diff --git a/tests/test_undate.py b/tests/test_undate.py index 35e7dae..025ecd1 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -12,6 +12,17 @@ def test_str(self): assert str(Undate(2022)) == "2022" assert str(Undate(month=11, day=7)) == "--11-07" + def test_partially_known_str(self): + assert str(Undate("19XX")) == "19XX" + assert str(Undate(2022, "1X")) == "2022-1X" + assert str(Undate(2022, 11, "2X")) == "2022-11-2X" + assert str(Undate(month="1X", day=7)) == "--1X-07" + + # TODO: should not allow initializing year/day without month; + # should we infer unknown month? or raise an exception? + # assert str(Undate(2022, day="2X")) == "2022-XX-2X" # currently returns 2022-2X + # assert str(Undate(2022, day=7)) == "2022-XX-07" @ currently returns 2022-07 + def test_repr(self): assert repr(Undate(2022, 11, 7)) == "" assert ( @@ -55,7 +66,7 @@ def test_init_partially_known_month(self): unknown_month = Undate(1900, "XX") assert unknown_month.earliest.month == 1 assert unknown_month.latest.month == 12 - assert str(unknown_month) == "1900" # NOT 1900-XX ? 
+ assert str(unknown_month) == "1900-XX" def test_init_partially_known_day(self): uncertain_day = Undate(1900, 1, "XX") # treat as None From a37377f8cbc133b745e7370a44d8cc6ec3db8bf1 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 14 Jul 2023 08:43:24 -0400 Subject: [PATCH 10/13] Adjust partial date duration calculation for dates that wrap years --- src/undate/undate.py | 9 +++++++-- tests/test_undate.py | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index d3a74d2..96dd346 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -322,13 +322,18 @@ def duration(self) -> datetime.timedelta: elif not self.latest.known_year and not self.earliest.known_year: # under what circumstances can we assume that if both years # are unknown the dates are in the same year or sequential? - duration = self.latest.earliest - self.earliest.earliest + ONE_DAY + duration = self.latest.earliest - self.earliest.earliest # if we get a negative, we've wrapped from end of one year - # to the beginning of the next + # to the beginning of the next; + # recalculate assuming second date is in the subsequent year if duration.days < 0: end = self.latest.earliest + relativedelta(years=1) duration = end - self.earliest.earliest + # add the additional day *after* checking for a negative + else: + duration += ONE_DAY + return duration else: diff --git a/tests/test_undate.py b/tests/test_undate.py index 025ecd1..7293553 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -265,3 +265,10 @@ def test_duration(self): Undate(None, 12, 1), Undate(None, 1, 1) ).duration() assert month_noyear_duration.days == 31 + + # real case from Shakespeare and Company Project data; + # second date is a year minus one day in the future + month_noyear_duration = UndateInterval( + Undate(None, 6, 7), Undate(None, 6, 6) + ).duration() + assert month_noyear_duration.days == 364 From 615cb0d9af313128a562107305d1b7c2eddd9535 Mon Sep 17 
00:00:00 2001 From: rlskoeser Date: Fri, 14 Jul 2023 08:51:12 -0400 Subject: [PATCH 11/13] Make duration logic more consistent --- src/undate/undate.py | 4 ++-- tests/test_undate.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/undate/undate.py b/src/undate/undate.py index 96dd346..e816adb 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -331,8 +331,8 @@ def duration(self) -> datetime.timedelta: duration = end - self.earliest.earliest # add the additional day *after* checking for a negative - else: - duration += ONE_DAY + # or after recalculating with adjusted year + duration += ONE_DAY return duration diff --git a/tests/test_undate.py b/tests/test_undate.py index 7293553..d02c3d2 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -264,11 +264,12 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 31 + assert month_noyear_duration.days == 32 + # this seems wrong, but we currently count both start and dates # real case from Shakespeare and Company Project data; # second date is a year minus one day in the future month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() - assert month_noyear_duration.days == 364 + assert month_noyear_duration.days == 365 From c9e767313be0e812389baa88b77201f98e9406d5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 14 Jul 2023 09:16:54 -0400 Subject: [PATCH 12/13] Add jupyter notebook comparing partial date duration logic with S&co --- .../shxco_partial_date_durations.ipynb | 5667 +++++++++++++++++ 1 file changed, 5667 insertions(+) create mode 100644 examples/notebooks/shxco_partial_date_durations.ipynb diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb new file mode 100644 index 0000000..11d6662 --- /dev/null +++ 
b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -0,0 +1,5667 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# undate partial date duration check\n", + "compare undate interval duration calculation for date ranges between partial dates with Shakespeare and Company Project events dataset" + ], + "metadata": { + "id": "s_holu9LI6q1" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TbokQJlu4G7Y", + "outputId": "d30849fd-811c-492d-ed37-a66ea4dd9088" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates\n", + " Cloning https://github.com/dh-tech/undate-python.git (to revision story/3-partially-known-dates) to /tmp/pip-req-build-aoklox4b\n", + " Running command git clone --filter=blob:none --quiet https://github.com/dh-tech/undate-python.git /tmp/pip-req-build-aoklox4b\n", + " Running command git checkout -b story/3-partially-known-dates --track origin/story/3-partially-known-dates\n", + " Switched to a new branch 'story/3-partially-known-dates'\n", + " Branch 'story/3-partially-known-dates' set up to track remote branch 'story/3-partially-known-dates' from 'origin'.\n", + " Resolved https://github.com/dh-tech/undate-python.git to commit 615cb0d9af313128a562107305d1b7c2eddd9535\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ...
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from undate==0.2.0.dev0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->undate==0.2.0.dev0) (1.16.0)\n", + "Building wheels for collected packages: undate\n", + " Building wheel for undate (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for undate: filename=undate-0.2.0.dev0-py3-none-any.whl size=15200 sha256=ca2e25447c84ad830f1e7ac31a43d67701390d594fff44637a565c3de5bb6134\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-ctzxaxcn/wheels/b0/dd/8f/69e3af2abd0249334bdcc1836876d45b86b0e1183e79b71123\n", + "Successfully built undate\n", + "Installing collected packages: undate\n", + "Successfully installed undate-0.2.0.dev0\n" + ] + } + ], + "source": [ + "# install from feature branch for now, until merged\n", + "%pip install git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# load most recent version of S&co events dataset\n", + "\n", + "# dataspace link on S&co website is currently broken\n", + "#events_df = pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/2/SCoData_events_v1.2_2022-01.csv\")\n", + "# other dataset link resulting in an error; incomplete download?\n", + "# events_df = pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/1\")\n", + "events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n", + "events_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + }, + "id": "Q7KZRmj_4ySW", + "outputId": "ee3cacd7-c347-437a-ee8e-91a4086d6e88" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": 
"stderr", + "text": [ + ":9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "0 Generic 1920 NaN \n", + "1 Subscription 1921 NaN \n", + "2 Borrow 1922 1922-08-23 \n", + "3 Generic 1922 NaN \n", + "4 Subscription 1922 NaN \n", + "\n", + " member_uris member_names \\\n", + "0 https://shakespeareandco.princeton.edu/members... Raymonde Linossier \n", + "1 https://shakespeareandco.princeton.edu/members... Mme Garreta \n", + "2 https://shakespeareandco.princeton.edu/members... Mr. Rhys \n", + "3 https://shakespeareandco.princeton.edu/members... Ernest Walsh \n", + "4 https://shakespeareandco.princeton.edu/members... Mr. Lincoln \n", + "\n", + " member_sort_names subscription_price_paid subscription_deposit \\\n", + "0 Linossier, Raymonde NaN NaN \n", + "1 Garreta, Mme NaN NaN \n", + "2 Rhys, Mr. NaN NaN \n", + "3 Walsh, Ernest NaN NaN \n", + "4 Lincoln, Mr. NaN 7.0 \n", + "\n", + " subscription_duration subscription_duration_days ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... \n", + "\n", + " item_uri item_title \\\n", + "0 https://shakespeareandco.princeton.edu/books/b... Pigs Is Pigs \n", + "1 NaN NaN \n", + "2 https://shakespeareandco.princeton.edu/books/c... Typhoon \n", + "3 https://shakespeareandco.princeton.edu/books/b... 
The Pretty Lady \n", + "4 NaN NaN \n", + "\n", + " item_volume item_authors item_year item_notes \\\n", + "0 NaN Butler, Ellis Parker 1906.0 NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN Conrad, Joseph 1902.0 NaN \n", + "3 NaN Bennett, Arnold 1918.0 NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "0 Lending Library Card Sylvia Beach, Raymonde Linossier Lending Libra... \n", + "1 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "2 Lending Library Card Sylvia Beach, Rhys Lending Library Card, Box 4... \n", + "3 Lending Library Card Sylvia Beach, Ernest Walsh Lending Library Car... \n", + "4 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest \\\n", + "0 https://figgy.princeton.edu/concern/scanned_re... \n", + "1 NaN \n", + "2 https://figgy.princeton.edu/concern/scanned_re... \n", + "3 https://figgy.princeton.edu/concern/scanned_re... \n", + "4 NaN \n", + "\n", + " source_image \n", + "0 https://iiif.princeton.edu/loris/figgy_prod/00... \n", + "1 NaN \n", + "2 https://iiif.princeton.edu/loris/figgy_prod/67... \n", + "3 https://iiif.princeton.edu/loris/figgy_prod/af... \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
0Generic1920NaNhttps://shakespeareandco.princeton.edu/members...Raymonde LinossierLinossier, RaymondeNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...Pigs Is PigsNaNButler, Ellis Parker1906.0NaNLending Library CardSylvia Beach, Raymonde Linossier Lending Libra...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/00...
1Subscription1921NaNhttps://shakespeareandco.princeton.edu/members...Mme GarretaGarreta, MmeNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
2Borrow19221922-08-23https://shakespeareandco.princeton.edu/members...Mr. RhysRhys, Mr.NaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/c...TyphoonNaNConrad, Joseph1902.0NaNLending Library CardSylvia Beach, Rhys Lending Library Card, Box 4...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/67...
3Generic1922NaNhttps://shakespeareandco.princeton.edu/members...Ernest WalshWalsh, ErnestNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...The Pretty LadyNaNBennett, Arnold1918.0NaNLending Library CardSylvia Beach, Ernest Walsh Lending Library Car...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/af...
4Subscription1922NaNhttps://shakespeareandco.princeton.edu/members...Mr. LincolnLincoln, Mr.NaN7.0NaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## method to calculate durations\n", + "\n", + "define a method to initialize undate interval from start and end date string in ISO format as used in S&co datasets\n", + "\n", + "\n", + "**Note:** there's an off-by-one discrepancy between how we currently calculate duration in undate and in the Shakespeare and Company Project code: the S&co code counts the first day in the range but not the last (could also be thought of as counting half of start and end dates)." + ], + "metadata": { + "id": "0Y6CsfIAJoqi" + } + }, + { + "cell_type": "code", + "source": [ + "from undate.undate import UndateInterval\n", + "from undate.dateformat.iso8601 import ISO8601DateFormat\n", + "\n", + "def undate_duration(start_date, end_date):\n", + " isoformat = ISO8601DateFormat()\n", + "\n", + " unstart = isoformat.parse(start_date)\n", + " unend = isoformat.parse(end_date)\n", + " interval = UndateInterval(earliest=unstart, latest=unend)\n", + "\n", + " # subtract one here for simplicity of comparison,\n", + " # to reconcile the difference between how undate and the S&co code calculate duration\n", + "\n", + " return interval.duration().days - 1" + ], + "metadata": { + "id": "y_MqgrQW64uI" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## subscription events\n", + "\n", + "compare subscription events with known duration" + ], + "metadata": { + "id": "JBVWMB7lJbYB" + } + }, + { + "cell_type": "code", + "source": [ + "# identify subscription events with duration information\n", + "subs_duration = events_df[events_df.subscription_duration_days.notna()]\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "c8iPHU5K58cz", + "outputId": "c0cc72ef-ed0b-4a30-d7b5-ea21ef0582c7" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { +
"text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... 
\n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image \n", + "28 NaN NaN \n", + "70 NaN NaN \n", + "233 NaN NaN \n", + "234 NaN NaN \n", + "260 NaN NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the subscription duration day values look like?\n", + "subs_duration.subscription_duration_days.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9i0aN7iQ6voY", + "outputId": "fe1ac93f-5571-4bd3-e4c1-06e90cf33f5c" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "31.0 2997\n", + "30.0 1975\n", + "92.0 936\n", + "91.0 397\n", + "365.0 337\n", + " ... \n", + "69.0 1\n", + "36.0 1\n", + "73.0 1\n", + "574.0 1\n", + "171.0 1\n", + "Name: subscription_duration_days, Length: 133, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration.subscription_duration_days.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aGqi4LRp60tV", + "outputId": "fbd61c94-41ab-40a7-87c2-cf0548c75d5a" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "count 9146.000000\n", + "mean 72.142685\n", + "std 81.559368\n", + "min 1.000000\n", + "25% 30.000000\n", + "50% 31.000000\n", + "75% 91.000000\n", + "max 574.000000\n", + "Name: subscription_duration_days, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# do we have unknown start/end date values?\n", + "subs_duration[subs_duration.start_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "pUsAb16MKqvb", + "outputId": "27f3b8e7-c5a5-4297-eb7e-e37e81945dda" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, 
subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", + "Index: []\n", + "\n", + "[0 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
\n", + "

0 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration[subs_duration.end_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "0odaog0eK0CN", + "outputId": "1e8814ff-0043-4969-b1d1-7574c3e82008" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "13168 Subscription 1932-10-06 NaN \n", + "13686 Subscription 1933-03-02 NaN \n", + "\n", + " member_uris \\\n", + "13168 https://shakespeareandco.princeton.edu/members... \n", + "13686 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "13168 Jean (Bakewell) Connolly / Mrs. Cyril Connolly \n", + "13686 Stanislas Pascal Franchot \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "13168 Connolly, Jean NaN \n", + "13686 Franchot, Stanislas Pascal NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "13168 100.0 NaN 31.0 \n", + "13686 50.0 NaN 31.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year \\\n", + "13168 ... NaN NaN NaN NaN NaN \n", + "13686 ... NaN NaN NaN NaN NaN \n", + "\n", + " item_notes source_type \\\n", + "13168 NaN Logbook \n", + "13686 NaN Logbook;Lending Library Card \n", + "\n", + " source_citation \\\n", + "13168 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "13686 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "\n", + " source_manifest \\\n", + "13168 NaN \n", + "13686 ;https://figgy.princeton.edu/concern/scanned_r... \n", + "\n", + " source_image \n", + "13168 NaN \n", + "13686 ;https://iiif.princeton.edu/loris/figgy_prod/7... \n", + "\n", + "[2 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
13168Subscription1932-10-06NaNhttps://shakespeareandco.princeton.edu/members...Jean (Bakewell) Connolly / Mrs. Cyril ConnollyConnolly, JeanNaN100.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbookSylvia Beach, Logbooks 1919–1941, Sylvia Beach...NaNNaN
13686Subscription1933-03-02NaNhttps://shakespeareandco.princeton.edu/members...Stanislas Pascal FranchotFranchot, Stanislas PascalNaN50.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbook;Lending Library CardSylvia Beach, Logbooks 1919–1941, Sylvia Beach...;https://figgy.princeton.edu/concern/scanned_r...;https://iiif.princeton.edu/loris/figgy_prod/7...
\n", + "

2 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# omit events with unknown end date since we can't recalculate duration\n", + "# (duration in the dataset is based on the subscription duration)\n", + "subs_duration = subs_duration[subs_duration.end_date.notna()]" + ], + "metadata": { + "id": "jwvN9-CgLQRx" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "Z-CVWd3z7Jb6", + "outputId": "d52d57d4-9803-4bfa-9708-bdf149c7098b" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. 
NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image undate_duration \n", + "28 NaN NaN 730 \n", + "70 NaN NaN 730 \n", + "233 NaN NaN 61 \n", + "234 NaN NaN 180 \n", + "260 NaN NaN 152 \n", + "\n", + "[5 rows x 29 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_imageundate_duration
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN61
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN180
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN152
\n", + "

5 rows × 29 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# compare undate duration with dataset duration\n", + "# limit to fields we care about\n", + "subs_duration = subs_duration[['start_date', 'end_date', 'subscription_duration', 'subscription_duration_days', 'undate_duration']]\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "fVf6M2E2LgnH", + "outputId": "87e6585a-670d-466e-d206-caabaaa48df9" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration \n", + "28 365.0 730 \n", + "70 365.0 730 \n", + "233 31.0 61 \n", + "234 153.0 180 \n", + "260 122.0 152 \n", + "... ... ... \n", + "35114 30.0 30 \n", + "35115 30.0 30 \n", + "35116 31.0 31 \n", + "35118 90.0 90 \n", + "35119 31.0 31 \n", + "\n", + "[9144 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_duration
28192719281 year365.0730
70193119321 year365.0730
2331921-071921-081 month31.061
2341921-091922-025 months153.0180
2601923-061923-104 months122.0152
..................
351141941-11-241941-12-241 month30.030
351151941-11-241941-12-241 month30.030
351161941-12-041942-01-041 month31.031
351181941-12-081942-03-083 months90.090
351191941-12-091942-01-091 month31.031
\n", + "

9144 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "drnCqTtsL835", + "outputId": "dc042b74-295a-436c-9c70-c6014d986cf7" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... \n", + "35114 30.0 30 0.0 \n", + "35115 30.0 30 0.0 \n", + "35116 31.0 31 0.0 \n", + "35118 90.0 90 0.0 \n", + "35119 31.0 31 0.0 \n", + "\n", + "[9144 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
351141941-11-241941-12-241 month30.0300.0
351151941-11-241941-12-241 month30.0300.0
351161941-12-041942-01-041 month31.0310.0
351181941-12-081942-03-083 months90.0900.0
351191941-12-091942-01-091 month31.0310.0
\n", + "

9144 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration['duration_diff'].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z3i984igMNjm", + "outputId": "c8a3580e-a36a-4756-d427-286ba8e5cf91" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0.0 9065\n", + " 30.0 30\n", + " 29.0 21\n", + " 1.0 10\n", + "-1.0 9\n", + " 28.0 4\n", + " 365.0 2\n", + " 27.0 1\n", + " 2.0 1\n", + "-3.0 1\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### investigate discrepancies" + ], + "metadata": { + "id": "Uu9kmAA_gm5o" + } + }, + { + "cell_type": "code", + "source": [ + "# investigate the ones with larger differences\n", + "subset_subdurations = subs_duration[subs_duration.duration_diff != 0]\n", + "subset_subdurations" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "gdenGvR1MkUG", + "outputId": "589b6b49-3f9c-42d5-e01f-326401007878" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... 
\n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 \n", + "34892 31.0 30 -1.0 \n", + "\n", + "[79 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "

79 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# too many to look at once, can we segment by subscription duration?\n", + "subset_subdurations.subscription_duration.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9_w1Cwl2N81d", + "outputId": "c0733942-16cd-42bf-c9a3-abbf250e44f5" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 month 38\n", + "3 months 12\n", + "2 months 7\n", + "6 months 6\n", + "4 months 5\n", + "5 months 3\n", + "1 year 2\n", + "7 months 2\n", + "8 months 2\n", + "11 months 1\n", + "10 months 1\n", + "Name: subscription_duration, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# lots of one-month subscriptions, what do the discrepancies look like?\n", + "subset_subdurations[subset_subdurations.subscription_duration == '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "snv1qguUOHPB", + "outputId": "dce76078-236b-48ee-9607-5d702cf4ee04" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "233 1921-07 1921-08 1 month \n", + "261 1923-08 1923-09 1 month \n", + "271 1924-02 1924-03 1 month \n", + "313 1926-11 1926-12 1 month \n", + "354 1928-02 1928-03 1 month \n", + "356 1928-02 1928-03 1 month \n", + "393 1929-08 1929-09 1 month \n", + "394 1929-08 1929-09 1 month \n", + "430 1930-05 1930-06 1 month \n", + "444 1930-11 1930-12 1 month \n", + "462 1931-05 1931-06 1 month \n", + "464 1931-06 1931-07 1 month \n", + "466 1931-07 1931-08 1 month \n", + "468 1931-08 1931-09 1 month \n", + "472 1931-09 1931-10 1 month \n", + "477 1931-10 1931-11 1 month \n", + "478 1931-10 1931-11 1 month \n", + "483 1931-11 1931-12 
1 month \n", + "484 1931-11 1931-12 1 month \n", + "487 1931-12 1932-01 1 month \n", + "492 1932-01 1932-02 1 month \n", + "500 1932-02 1932-03 1 month \n", + "501 1932-02 1932-03 1 month \n", + "504 1932-03 1932-04 1 month \n", + "516 1932-04 1932-05 1 month \n", + "517 1932-05 1932-06 1 month \n", + "7064 1926-09-15 1926-10-15 1 month \n", + "31089 1923-11-22 1923-12-22 1 month \n", + "31511 1924-11-08 1924-12-08 1 month \n", + "31722 1925-05-09 1925-06-09 1 month \n", + "32269 1926-06-10 1926-07-10 1 month \n", + "32444 1926-10-07 1926-11-07 1 month \n", + "33401 1929-05-18 1929-06-18 1 month \n", + "33665 1932-12-15 1933-01-15 1 month \n", + "33709 1933-02-03 1933-03-03 1 month \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 30.0 \n", + "261 31.0 60 29.0 \n", + "271 29.0 59 30.0 \n", + "313 30.0 60 30.0 \n", + "354 29.0 59 30.0 \n", + "356 29.0 59 30.0 \n", + "393 31.0 60 29.0 \n", + "394 31.0 60 29.0 \n", + "430 31.0 60 29.0 \n", + "444 30.0 60 30.0 \n", + "462 31.0 60 29.0 \n", + "464 30.0 60 30.0 \n", + "466 31.0 61 30.0 \n", + "468 31.0 60 29.0 \n", + "472 30.0 60 30.0 \n", + "477 31.0 60 29.0 \n", + "478 31.0 60 29.0 \n", + "483 30.0 60 30.0 \n", + "484 30.0 60 30.0 \n", + "487 31.0 61 30.0 \n", + "492 31.0 59 28.0 \n", + "500 29.0 59 30.0 \n", + "501 29.0 59 30.0 \n", + "504 31.0 60 29.0 \n", + "516 30.0 60 30.0 \n", + "517 31.0 60 29.0 \n", + "7064 31.0 30 -1.0 \n", + "31089 31.0 30 -1.0 \n", + "31511 31.0 30 -1.0 \n", + "31722 30.0 31 1.0 \n", + "32269 31.0 30 -1.0 \n", + "32444 30.0 31 1.0 \n", + "33401 30.0 31 1.0 \n", + "33665 30.0 31 1.0 \n", + "33709 31.0 28 -3.0 \n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "34892 31.0 30 -1.0 " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
2331921-071921-081 month31.06130.0
2611923-081923-091 month31.06029.0
2711924-021924-031 month29.05930.0
3131926-111926-121 month30.06030.0
3541928-021928-031 month29.05930.0
3561928-021928-031 month29.05930.0
3931929-081929-091 month31.06029.0
3941929-081929-091 month31.06029.0
4301930-051930-061 month31.06029.0
4441930-111930-121 month30.06030.0
4621931-051931-061 month31.06029.0
4641931-061931-071 month30.06030.0
4661931-071931-081 month31.06130.0
4681931-081931-091 month31.06029.0
4721931-091931-101 month30.06030.0
4771931-101931-111 month31.06029.0
4781931-101931-111 month31.06029.0
4831931-111931-121 month30.06030.0
4841931-111931-121 month30.06030.0
4871931-121932-011 month31.06130.0
4921932-011932-021 month31.05928.0
5001932-021932-031 month29.05930.0
5011932-021932-031 month29.05930.0
5041932-031932-041 month31.06029.0
5161932-041932-051 month30.06030.0
5171932-051932-061 month31.06029.0
70641926-09-151926-10-151 month31.030-1.0
310891923-11-221923-12-221 month31.030-1.0
315111924-11-081924-12-081 month31.030-1.0
317221925-05-091925-06-091 month30.0311.0
322691926-06-101926-07-101 month31.030-1.0
324441926-10-071926-11-071 month30.0311.0
334011929-05-181929-06-181 month30.0311.0
336651932-12-151933-01-151 month30.0311.0
337091933-02-031933-03-031 month31.028-3.0
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The first set of these is calculated differently because the dates are partial; undate logic calculates duration from the earliest possible date through the last possible date, but in these cases we have additional, project-specific information that undate can't take into account, i.e. the subscription duration is one month, starting sometime in a known year or month.\n", + "\n", + "The handful towards the end that are off by a few days at most (mostly by one, in either direction (+/-)) are a little more concerning... (potential bug in S&co code? or value calculated based on known semantic duration?)" + ], + "metadata": { + "id": "Rm4jqlA4hq9E" + } + }, + { + "cell_type": "code", + "source": [ + "# durations other than one month\n", + "subset_subdurations[subset_subdurations.subscription_duration != '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "TEL7qdNhOXHL", + "outputId": "50e051d5-18ae-4f24-a229-fc02fb610ed8" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "272 1924-02 1924-04 2 months \n", + "293 1926-03 1926-10 7 months \n", + "321 1927-03 1928-02 11 months \n", + "331 1927-07 1927-10 3 months \n", + "337 1927-10 1928-06 8 months \n", + "349 1928-01 1928-04 3 months \n", + "388 1929-06 1930-04 10 months \n", + "408 1930-01 1930-04 3 months \n", + "409 1930-01 1930-04 3 months \n", + "412 1930-01 1930-09 8 months \n", + "415 1930-02 1930-06 4 months \n", + "431 1930-05 1930-07 2 months \n", + "437 1930-09 1930-12 3 months \n", + "454 1930-12 1931-03 3 months \n", + "459 1931-03 1931-05 2 months \n", + "465 1931-07 1931-10 3 months \n", + "471 1931-09 1931-12 3 months \n", 
+ "475 1931-09 1931-12 3 months \n", + "476 1931-10 1932-03 5 months \n", + "480 1931-10 1932-02 4 months \n", + "485 1931-11 1932-06 7 months \n", + "486 1931-12 1932-05 5 months \n", + "489 1931-12 1932-02 2 months \n", + "490 1931-12 1932-04 4 months \n", + "496 1932-01 1932-03 2 months \n", + "502 1932-02 1932-06 4 months \n", + "506 1932-03 1932-05 2 months \n", + "507 1932-03 1932-05 2 months \n", + "709 1919-12-02 1920-06-02 6 months \n", + "7560 1927-01-11 1927-04-11 3 months \n", + "31480 1924-10-17 1925-04-17 6 months \n", + "31917 1925-10-21 1926-01-21 3 months \n", + "32613 1927-03-14 1927-06-14 3 months \n", + "32671 1927-06-14 1927-12-14 6 months \n", + "32869 1927-12-14 1928-06-14 6 months \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "272 60.0 89 29.0 \n", + "293 214.0 244 30.0 \n", + "321 337.0 365 28.0 \n", + "331 92.0 122 30.0 \n", + "337 244.0 273 29.0 \n", + "349 91.0 120 29.0 \n", + "388 304.0 333 29.0 \n", + "408 90.0 119 29.0 \n", + "409 90.0 119 29.0 \n", + "412 243.0 272 29.0 \n", + "415 120.0 149 29.0 \n", + "431 61.0 91 30.0 \n", + "437 91.0 121 30.0 \n", + "454 90.0 120 30.0 \n", + "459 61.0 91 30.0 \n", + "465 92.0 122 30.0 \n", + "471 91.0 121 30.0 \n", + "475 91.0 121 30.0 \n", + "476 152.0 182 30.0 \n", + "480 123.0 151 28.0 \n", + "485 213.0 242 29.0 \n", + "486 152.0 182 30.0 \n", + "489 62.0 90 28.0 \n", + "490 122.0 151 29.0 \n", + "496 60.0 90 30.0 \n", + "502 121.0 150 29.0 \n", + "506 61.0 91 30.0 \n", + "507 61.0 91 30.0 \n", + "709 182.0 183 1.0 \n", + "7560 91.0 90 -1.0 \n", + "31480 181.0 182 1.0 \n", + "31917 91.0 92 1.0 \n", + "32613 90.0 92 2.0 \n", + "32671 184.0 183 -1.0 \n", + "32869 182.0 183 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 " + ], + "text/html": [ + "\n", + 
"\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
2721924-021924-042 months60.08929.0
2931926-031926-107 months214.024430.0
3211927-031928-0211 months337.036528.0
3311927-071927-103 months92.012230.0
3371927-101928-068 months244.027329.0
3491928-011928-043 months91.012029.0
3881929-061930-0410 months304.033329.0
4081930-011930-043 months90.011929.0
4091930-011930-043 months90.011929.0
4121930-011930-098 months243.027229.0
4151930-021930-064 months120.014929.0
4311930-051930-072 months61.09130.0
4371930-091930-123 months91.012130.0
4541930-121931-033 months90.012030.0
4591931-031931-052 months61.09130.0
4651931-071931-103 months92.012230.0
4711931-091931-123 months91.012130.0
4751931-091931-123 months91.012130.0
4761931-101932-035 months152.018230.0
4801931-101932-024 months123.015128.0
4851931-111932-067 months213.024229.0
4861931-121932-055 months152.018230.0
4891931-121932-022 months62.09028.0
4901931-121932-044 months122.015129.0
4961932-011932-032 months60.09030.0
5021932-021932-064 months121.015029.0
5061932-031932-052 months61.09130.0
5071932-031932-052 months61.09130.0
7091919-12-021920-06-026 months182.01831.0
75601927-01-111927-04-113 months91.090-1.0
314801924-10-171925-04-176 months181.01821.0
319171925-10-211926-01-213 months91.0921.0
326131927-03-141927-06-143 months90.0922.0
326711927-06-141927-12-146 months184.0183-1.0
328691927-12-141928-06-146 months182.01831.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## borrow events\n", + "\n", + "compare borrow events with known duration" + ], + "metadata": { + "id": "2tk6N7SXKKCu" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n", + "# limit to fields we care about for this check\n", + "borrow_duration = borrow_duration[['start_date', 'end_date', 'borrow_duration_days']]\n", + "borrow_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "fA1Nedmz6cyF", + "outputId": "5230d5ad-fec4-4353-a0d2-9676d1aa776d" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "602 --01-07 --01-13 6.0\n", + "603 --01-12 --01-20 8.0\n", + "604 --01-16 --02-16 31.0\n", + "605 --01-19 --01-24 5.0\n", + "606 --01-20 --01-28 8.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
602--01-07--01-136.0
603--01-12--01-208.0
604--01-16--02-1631.0
605--01-19--01-245.0
606--01-20--01-288.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "borrow_duration.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "KPOBIRsTUKM9", + "outputId": "4a251445-e7c7-4250-82df-ece0bc9a3d56" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "29903 1961-06-30 1961-10-04 96.0\n", + "29904 1961-06-30 1961-10-04 96.0\n", + "29905 1961-06-30 1961-10-04 96.0\n", + "29907 1961-10-04 1962-03-21 168.0\n", + "29908 1961-10-04 1962-03-21 168.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
299031961-06-301961-10-0496.0
299041961-06-301961-10-0496.0
299051961-06-301961-10-0496.0
299071961-10-041962-03-21168.0
299081961-10-041962-03-21168.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "borrow_duration.head(10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "39nEPZva8jDo", + "outputId": "6cff4de2-c188-43ad-dc75-684c4d461029" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration\n", + "602 --01-07 --01-13 6.0 6\n", + "603 --01-12 --01-20 8.0 8\n", + "604 --01-16 --02-16 31.0 31\n", + "605 --01-19 --01-24 5.0 5\n", + "606 --01-20 --01-28 8.0 8\n", + "607 --01-24 --03-20 55.0 55\n", + "608 --01-24 --03-20 55.0 55\n", + "609 --01-24 --03-20 55.0 55\n", + "610 --01-24 --05-30 126.0 126\n", + "611 --01-24 --05-30 126.0 126" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_duration
602--01-07--01-136.06
603--01-12--01-208.08
604--01-16--02-1631.031
605--01-19--01-245.05
606--01-20--01-288.08
607--01-24--03-2055.055
608--01-24--03-2055.055
609--01-24--03-2055.055
610--01-24--05-30126.0126
611--01-24--05-30126.0126
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "rL5S47wPWfd-", + "outputId": "127af40e-0037-4f99-d590-9cc2466a206b" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration \\\n", + "602 --01-07 --01-13 6.0 6 \n", + "603 --01-12 --01-20 8.0 8 \n", + "604 --01-16 --02-16 31.0 31 \n", + "605 --01-19 --01-24 5.0 5 \n", + "606 --01-20 --01-28 8.0 8 \n", + "... ... ... ... ... \n", + "29903 1961-06-30 1961-10-04 96.0 96 \n", + "29904 1961-06-30 1961-10-04 96.0 96 \n", + "29905 1961-06-30 1961-10-04 96.0 96 \n", + "29907 1961-10-04 1962-03-21 168.0 168 \n", + "29908 1961-10-04 1962-03-21 168.0 168 \n", + "\n", + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "... ... \n", + "29903 0.0 \n", + "29904 0.0 \n", + "29905 0.0 \n", + "29907 0.0 \n", + "29908 0.0 \n", + "\n", + "[19728 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
602--01-07--01-136.060.0
603--01-12--01-208.080.0
604--01-16--02-1631.0310.0
605--01-19--01-245.050.0
606--01-20--01-288.080.0
..................
299031961-06-301961-10-0496.0960.0
299041961-06-301961-10-0496.0960.0
299051961-06-301961-10-0496.0960.0
299071961-10-041962-03-21168.01680.0
299081961-10-041962-03-21168.01680.0
\n", + "

19728 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the duration differences look like?\n", + "borrow_duration.duration_diff.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DQumLSXZW7r6", + "outputId": "fc5196d6-9d9a-430e-ecb2-c142676c3614" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.0 19728\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Woohoo, everything matches! 🎉\n", + "\n", + "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in the undate duration method when the start and end dates have unknown years and the dates wrap to the following year (e.g., December to January), which has now been corrected.\n", + "\n", + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + ], + "metadata": { + "id": "r0TUYWzSXIil" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration[borrow_duration.duration_diff != 0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "-Bq76gtDWljg", + "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", + "Index: []" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + } + ] +} \ No newline at end of file From 698e68a8a6e171e9b79315edb77abdfeeafdec48 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 14 Jul 2023 09:26:24 -0400 Subject: [PATCH 13/13] Pin sphinx to pre 7.0 for compatibility with rtd theme --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 8d04412..217c8ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,9 +58,10 @@ test = pytest-ordering pytest-cov docs = - sphinx + sphinx<7.0.0 sphinx_rtd_theme m2r2 +# pin sphinx because 7.0 currently not compatible with rtd theme [options.packages.find] where = src