'''Date and time parsing'''
import datetime
import json
import os.path
import re
import sys
import unicodedata
import pprint
from typing import Iterable, Tuple, Optional
AM_STRINGS = {
    'am',
    'a. m',
    'vorm',
    'ص',
    'पूर्व',
    '上午',
    '午前',
}
'''Day period markers meaning "before noon" (AM), lowercase.'''
PM_STRINGS = {
    'pm',
    'p. m',
    'nachm',
    'م',
    'अपर',
    '下午',
    '午後',
}
'''Day period markers meaning "after noon" (PM), lowercase.'''
# Lookup table from lowercase month names/abbreviations, in several scripts,
# to the month number 1-12.  parse_month() indexes it with a lowercased
# string and, failing that, with the first three characters of that string.
# NOTE(review): the entries look like output of parse_cldr_json() in this
# file (abbreviated CLDR month names, NFKD-normalized, lowercased, with
# surrounding '.' stripped) -- confirm before editing by hand.
# NOTE(review): parse_datetime() NFKD-normalizes its input before matching,
# so keys containing diacritics presumably have to be stored in NFKD form to
# be found -- verify when adding entries.
MONTH_MAP = {
# Digits followed by the 月 character.
'10月': 10,
'11月': 11,
'12月': 12,
'1月': 1,
'2月': 2,
'3月': 3,
'4月': 4,
'5月': 5,
'6月': 6,
'7月': 7,
'8月': 8,
'9月': 9,
# Latin-script names.
'abr': 4,
'ago': 8,
'août': 8,
'apr': 4,
'aug': 8,
'avr': 4,
'cze': 6,
'dec': 12,
'dez': 12,
'déc': 12,
'dic': 12,
'ene': 1,
'feb': 2,
'fev': 2,
'févr': 2,
'gru': 12,
'jan': 1,
'janv': 1,
'juil': 7,
'juin': 6,
'jul': 7,
'juli': 7,
'jun': 6,
'juni': 6,
'kwi': 4,
'lip': 7,
'lis': 11,
'lut': 2,
'mai': 5,
'maj': 5,
'mar': 3,
'mars': 3,
'may': 5,
'märz': 3,
'nov': 11,
'oct': 10,
'okt': 10,
'out': 10,
'paź': 10,
'sep': 9,
'sept': 9,
'set': 9,
'sie': 8,
'sty': 1,
'wrz': 9,
# Cyrillic-script names.
'авг': 8,
'апр': 4,
'дек': 12,
'июля': 7,
'июня': 6,
'марта': 3,
'мая': 5,
'нояб': 11,
'окт': 10,
'сент': 9,
'февр': 2,
'янв': 1,
# Arabic-script names.
'أبريل': 4,
'أغسطس': 8,
'أكتوبر': 10,
'ديسمبر': 12,
'سبتمبر': 9,
'فبراير': 2,
'مارس': 3,
'مايو': 5,
'نوفمبر': 11,
'يناير': 1,
'يوليو': 7,
'يونيو': 6,
# Devanagari-script names.
'अक्टू': 10,
'अग': 8,
'अप्रै': 4,
'जन': 1,
'जुला': 7,
'जून': 6,
'दिसं': 12,
'नवं': 11,
'फ़र': 2,
'मई': 5,
'मार्च': 3,
'सितं': 9
}
'''Month names to int.'''
# Alternation of every AM/PM marker, shared by the patterns below.  Each
# marker is escaped so characters such as the '.' in 'a. m' match literally,
# and markers are sorted (longest first) so the compiled pattern does not
# depend on set iteration order.
_DAY_PERIOD_ALTERNATION = '|'.join(
    re.escape(period)
    for period in sorted(AM_STRINGS | PM_STRINGS,
                         key=lambda period: (-len(period), period))
)
DAY_PERIOD_PATTERN = re.compile(
    r'({})\b'.format(_DAY_PERIOD_ALTERNATION), re.IGNORECASE)
'''Regex pattern for AM/PM string.'''
ISO_8601_DATE_PATTERN = re.compile(
    r'(\d{4})(?!\d)[\w./-](\d{1,2})(?!\d)[\w./-](\d{1,2})')
'''Regex pattern for dates similar to YYYY-MM-DD.'''
MMM_DD_YY_PATTERN = re.compile(
    r'([^\W\d_]{3,4})\s{0,4}(\d{1,2})\s{0,4}(\d{0,4})')
'''Regex pattern for dates similar to MMM DD YY.
Example: Feb 09 90
'''
NN_NN_NNNN_PATTERN = re.compile(r'(\d{1,2})[./-](\d{1,2})[./-](\d{2,4})')
'''Regex pattern for dates similar to NN NN YYYY.
Example: 2/9/90
'''
# The trailing alternative must be written as a raw string: in a normal
# string literal '\b' is the backspace character, not a word boundary.
TIME_PATTERN = re.compile(
    r'(\d{1,2}):(\d{2}):?(\d{0,2})\s?(' +
    _DAY_PERIOD_ALTERNATION + r'|\b)?')
'''Regex pattern for time in HH:MM[:SS], optionally followed by AM/PM.

Groups: 1 = hour, 2 = minute, 3 = second (may be empty),
4 = day period marker (may be empty or ``None``).
'''
def parse_datetime(text: str, date_format: Optional[str]=None,
                   is_day_period: Optional[bool]=None,
                   datetime_now: Optional[datetime.datetime]=None) \
        -> Tuple[datetime.datetime, int, int]:
    '''Parse date/time from a line of text into datetime object.

    The text is NFKD-normalized and lowercased before matching.  A time
    (``HH:MM[:SS]``) is searched first; if none is found the time is
    midnight.  Then a date is searched using, in order, the ``YYYY-MM-DD``
    like pattern, the month-name pattern and the all-numeric pattern.  The
    first pattern that matches wins.

    Args:
        text: The line of text to search.
        date_format: Restricts which date patterns are tried.  A false value
            tries all of them.  ``'ymd'`` tries the year-first pattern and
            then the all-numeric one.  ``'mdy'`` tries the month-name
            pattern and then the all-numeric one.  Any other value tries
            only the all-numeric pattern.  In the all-numeric pattern the
            first number is the month when ``date_format`` is ``'mdy'`` or
            the second number is greater than 12; otherwise the first
            number is the day.
        is_day_period: An AM/PM marker following the time is applied only
            when this is true.
        datetime_now: The current time.  It supplies the year when the
            month-name pattern matched without a four digit year.  Defaults
            to the current UTC time.  It is compared against a
            timezone-aware value, so it should be timezone-aware as well.

    Returns:
        A tuple of the parsed datetime (with UTC ``tzinfo``) and the start
        and end index of the matched date/time within the normalized text.

    Raises:
        ValueError: If no date is found, if the month-name pattern matched
            a word that is not a known month, or if the matched numbers do
            not form a valid date.
    '''
    if datetime_now is None:
        # datetime.now(tz) is the supported replacement for the deprecated
        # datetime.utcnow() and yields an aware datetime directly.
        datetime_now = datetime.datetime.now(datetime.timezone.utc)

    year = datetime_now.year
    month = datetime_now.month
    day = datetime_now.day
    hour = 0
    minute = 0
    second = 0
    date_ok = False
    # Sentinels so that min()/max() below pick up the first real match.
    start_index = float('+inf')
    end_index = float('-inf')
    ambiguous_year = False

    text = unicodedata.normalize('NFKD', text).lower()

    # Let's do time first
    match = TIME_PATTERN.search(text)

    if match:
        hour = int(match.group(1))
        minute = int(match.group(2))
        day_period = match.group(4)

        if match.group(3):
            second = int(match.group(3))

        if day_period and is_day_period and hour < 13:
            if day_period.lower() in PM_STRINGS:
                # 12 PM is noon already; every other PM hour shifts by 12.
                if hour != 12:
                    hour += 12
            elif hour == 12:
                # 12 AM is midnight.
                hour = 0

        start_index = match.start()
        end_index = match.end()

    # Now try dates
    if date_format == 'ymd' or not date_format:
        match = ISO_8601_DATE_PATTERN.search(text)

        if match:
            date_ok = True
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            start_index = min(start_index, match.start())
            end_index = max(end_index, match.end())

    if not date_ok and (date_format == 'mdy' or not date_format):
        match = MMM_DD_YY_PATTERN.search(text)

        if match:
            date_ok = True
            month = parse_month(match.group(1))
            day = int(match.group(2))
            year_str = match.group(3)

            if year_str and len(year_str) == 4:
                year = int(year_str)
            else:
                # Only a four digit number is trusted as the year; otherwise
                # the current year is kept and possibly adjusted below.
                ambiguous_year = True

            start_index = min(start_index, match.start())
            end_index = max(end_index, match.end())

    if not date_ok:
        match = NN_NN_NNNN_PATTERN.search(text)

        if match:
            date_ok = True
            num_1 = int(match.group(1))
            num_2 = int(match.group(2))
            year = int(match.group(3))

            if year < 100:
                year = y2k(year)

            if date_format == 'mdy' or num_2 > 12:
                month = num_1
                day = num_2
            else:
                day = num_1
                month = num_2

            start_index = min(start_index, match.start())
            end_index = max(end_index, match.end())

    if not date_ok:
        raise ValueError('Failed to parse date from {}'.format(repr(text)))

    guess_date = datetime.datetime(year, month, day, hour, minute, second,
                                   tzinfo=datetime.timezone.utc)

    if ambiguous_year and guess_date > datetime_now:
        # Sometimes year is not shown within 6 months
        # Year is shown for dates in the future
        guess_date = guess_date.replace(year=year - 1)

    return guess_date, start_index, end_index
def parse_month(text: str) -> int:
    '''Return the month number for a month name or abbreviation.

    The lowercased text is looked up in ``MONTH_MAP`` first as a whole and
    then by its first three characters.

    Raises:
        ValueError: If neither form is a key of ``MONTH_MAP``.
    '''
    lowered = text.lower()

    for candidate in (lowered, lowered[:3]):
        if candidate in MONTH_MAP:
            return MONTH_MAP[candidate]

    raise ValueError('Month {} not found.'.format(repr(lowered)))
def y2k(year: int) -> int:
    '''Convert two digit year to four digit year.

    Years 69 to 99 map to 1969-1999 and years 0 to 68 map to 2000-2068.

    Args:
        year: A year in the range 0 to 99 inclusive.

    Returns:
        The four digit year.

    Raises:
        AssertionError: If ``year`` is outside 0 to 99.
    '''
    assert 0 <= year <= 99, 'Not a two digit year {}'.format(year)
    # 69 and later belong to the 1900s; the offset is 1900, not 1000.
    return year + 1900 if year >= 69 else year + 2000
# Language codes whose data parse_cldr_json() reads when none are given.
# Each code is used as a directory name and as a key inside the JSON file.
DEFAULT_LANGUAGE_CODES = (
    'zh', 'es', 'en', 'hi', 'ar',
    'pt', 'ru', 'ja',
    'de', 'fr', 'pl',
)


def parse_cldr_json(directory: str,
                    language_codes: Iterable[str]=DEFAULT_LANGUAGE_CODES,
                    massage: bool=True) -> None:
    '''Print AM/PM strings and month names found in CLDR JSON datasets.

    For every language code the file
    ``<directory>/main/<lang>/ca-gregorian.json`` is read and the
    abbreviated Gregorian month names and AM/PM day periods are collected.
    The set of AM strings, the set of PM strings and the mapping of month
    name to month number are then pretty-printed to standard output, in
    that order.

    Args:
        directory: Directory containing the ``main/<lang>/`` subdirectories.
        language_codes: Language codes to read.
        massage: If true, every string is NFKD-normalized, lowercased and
            stripped of leading and trailing ``.`` characters.

    Raises:
        OSError: If a data file cannot be opened.
        KeyError: If a data file lacks one of the expected entries.
    '''
    def clean(value: str) -> str:
        # Normalization applied to every collected string.
        if not massage:
            return value
        return unicodedata.normalize('NFKD', value).lower().strip('.')

    am_strings = set()
    pm_strings = set()
    month_to_int = {}

    for lang in language_codes:
        path = os.path.join(directory, 'main', lang, 'ca-gregorian.json')

        # JSON text is UTF-8 encoded; do not rely on the platform's default
        # encoding, which would garble the non-ASCII names on some systems.
        with open(path, encoding='utf-8') as in_file:
            doc = json.load(in_file)

        gregorian = doc['main'][lang]['dates']['calendars']['gregorian']
        months_dict = gregorian['months']['format']['abbreviated']
        day_periods_dict = gregorian['dayPeriods']['format']['abbreviated']

        for month, month_str in months_dict.items():
            month_to_int[clean(month_str)] = int(month)

        am_str = day_periods_dict['am']
        pm_str = day_periods_dict['pm']
        am_strings.add(clean(am_str))
        pm_strings.add(clean(pm_str))

    pprint.pprint(am_strings)
    pprint.pprint(pm_strings)
    pprint.pprint(month_to_int)
if __name__ == '__main__':
    # Script use: the first argument is the directory passed to
    # parse_cldr_json().  Exit with a usage message instead of an
    # IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} CLDR_JSON_DIRECTORY'.format(sys.argv[0]))

    parse_cldr_json(sys.argv[1])