Source code for wpull.protocol.ftp.ls.date

'''Date and time parsing'''
import datetime
import json
import os.path
import re
import sys
import unicodedata
import pprint

from typing import Iterable, Tuple, Optional

AM_STRINGS = {'a. m', 'am', 'पूर्व', 'vorm', 'ص', '上午', '午前'}
'''Set of AM day period strings.'''
PM_STRINGS = {'nachm', 'अपर', 'م', 'p. m', '下午', 'pm', '午後'}
'''Set of PM day period strings.'''

MONTH_MAP = {
    '10月': 10,
    '11月': 11,
    '12月': 12,
    '1月': 1,
    '2月': 2,
    '3月': 3,
    '4月': 4,
    '5月': 5,
    '6月': 6,
    '7月': 7,
    '8月': 8,
    '9月': 9,
    'abr': 4,
    'ago': 8,
    'août': 8,
    'apr': 4,
    'aug': 8,
    'avr': 4,
    'cze': 6,
    'dec': 12,
    'dez': 12,
    'déc': 12,
    'dic': 12,
    'ene': 1,
    'feb': 2,
    'fev': 2,
    'févr': 2,
    'gru': 12,
    'jan': 1,
    'janv': 1,
    'juil': 7,
    'juin': 6,
    'jul': 7,
    'juli': 7,
    'jun': 6,
    'juni': 6,
    'kwi': 4,
    'lip': 7,
    'lis': 11,
    'lut': 2,
    'mai': 5,
    'maj': 5,
    'mar': 3,
    'mars': 3,
    'may': 5,
    'märz': 3,
    'nov': 11,
    'oct': 10,
    'okt': 10,
    'out': 10,
    'paź': 10,
    'sep': 9,
    'sept': 9,
    'set': 9,
    'sie': 8,
    'sty': 1,
    'wrz': 9,
    'авг': 8,
    'апр': 4,
    'дек': 12,
    'июля': 7,
    'июня': 6,
    'марта': 3,
    'мая': 5,
    'нояб': 11,
    'окт': 10,
    'сент': 9,
    'февр': 2,
    'янв': 1,
    'أبريل': 4,
    'أغسطس': 8,
    'أكتوبر': 10,
    'ديسمبر': 12,
    'سبتمبر': 9,
    'فبراير': 2,
    'مارس': 3,
    'مايو': 5,
    'نوفمبر': 11,
    'يناير': 1,
    'يوليو': 7,
    'يونيو': 6,
    'अक्टू': 10,
    'अग': 8,
    'अप्रै': 4,
    'जन': 1,
    'जुला': 7,
    'जून': 6,
    'दिसं': 12,
    'नवं': 11,
    'फ़र': 2,
    'मई': 5,
    'मार्च': 3,
    'सितं': 9
}
'''Month names to int.'''


DAY_PERIOD_PATTERN = re.compile(
    r'({})\b'.format('|'.join(AM_STRINGS | PM_STRINGS)), re.IGNORECASE)
'''Regex pattern for AM/PM string.'''

ISO_8601_DATE_PATTERN = re.compile(r'(\d{4})(?!\d)[\w./-](\d{1,2})(?!\d)[\w./-](\d{1,2})')
'''Regex pattern for dates similar to YYYY-MM-DD.'''

MMM_DD_YY_PATTERN = re.compile(r'([^\W\d_]{3,4})\s{0,4}(\d{1,2})\s{0,4}(\d{0,4})')
'''Regex pattern for dates similar to MMM DD YY.

Example: Feb 09 90
'''

NN_NN_NNNN_PATTERN = re.compile(r'(\d{1,2})[./-](\d{1,2})[./-](\d{2,4})')
'''Regex pattern for dates similar to NN NN YYYY.

Example: 2/9/90
'''

TIME_PATTERN = re.compile(
    r'(\d{1,2}):(\d{2}):?(\d{0,2})\s?(' +
    '|'.join(AM_STRINGS | PM_STRINGS) + '|\b)?')
'''Regex pattern for time in HH:MM[:SS]'''


[docs]def guess_datetime_format(lines: Iterable[str], threshold: int=5) \ -> Tuple[Optional[str], Optional[bool]]: '''Guess whether order of the year, month, day and 12/24 hour. Returns: tuple: First item is either str ``ymd``, ``dmy``, ``mdy`` or ``None``. Second item is either True for 12-hour time or False for 24-hour time or None. ''' time_12_score = 0 time_24_score = 0 date_ymd_score = 0 date_dmy_score = 0 date_mdy_score = 0 for line in lines: line = unicodedata.normalize('NFKD', line).lower() if DAY_PERIOD_PATTERN.search(line): time_12_score += 1 else: time_24_score += 1 if ISO_8601_DATE_PATTERN.search(line): date_ymd_score += 1 if MMM_DD_YY_PATTERN.search(line): date_mdy_score += 1 match = NN_NN_NNNN_PATTERN.search(line) if match: num_1 = int(match.group(1)) num_2 = int(match.group(2)) if num_1 > 12: date_dmy_score += 1 elif num_2 > 12: date_mdy_score += 1 time_score = time_12_score + time_24_score date_score = date_ymd_score + date_dmy_score + date_mdy_score if time_score >= threshold and date_score >= threshold: break if date_ymd_score or date_dmy_score or date_mdy_score: top = max([ (date_ymd_score, 'ymd'), (date_dmy_score, 'dmy'), (date_mdy_score, 'mdy'), ], key=lambda item: item[0] ) date_format = top[1] else: date_format = None if time_12_score or time_24_score: day_period = True if time_12_score > time_24_score else False else: day_period = None return date_format, day_period
[docs]def parse_datetime(text: str, date_format: str=None, is_day_period: Optional[bool]=None, datetime_now: datetime.datetime=None) \ -> Tuple[datetime.datetime, int, int]: '''Parse date/time from a line of text into datetime object.''' datetime_now = datetime_now or \ datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) year = datetime_now.year month = datetime_now.month day = datetime_now.day hour = 0 minute = 0 second = 0 date_ok = False start_index = float('+inf') end_index = float('-inf') ambiguous_year = False text = unicodedata.normalize('NFKD', text).lower() # Let's do time first match = TIME_PATTERN.search(text) if match: hour_str = match.group(1) hour = int(hour_str) minute = int(match.group(2)) day_period = match.group(4) if match.group(3): second = int(match.group(3)) if day_period and is_day_period and hour < 13: if day_period.lower() in PM_STRINGS: if hour != 12: hour += 12 elif hour == 12: hour = 0 start_index = match.start() end_index = match.end() # Now try dates if date_format == 'ymd' or not date_format: match = ISO_8601_DATE_PATTERN.search(text) if match: date_ok = True year = int(match.group(1)) month = int(match.group(2)) day = int(match.group(3)) start_index = min(start_index, match.start()) end_index = max(end_index, match.end()) if not date_ok and (date_format == 'mdy' or not date_format): match = MMM_DD_YY_PATTERN.search(text) if match: date_ok = True month_str = match.group(1) month = parse_month(month_str) day = int(match.group(2)) year_str = match.group(3) if year_str and len(year_str) == 4: year = int(year_str) else: ambiguous_year = True start_index = min(start_index, match.start()) end_index = max(end_index, match.end()) if not date_ok: match = NN_NN_NNNN_PATTERN.search(text) if match: date_ok = True num_1 = int(match.group(1)) num_2 = int(match.group(2)) year = int(match.group(3)) if year < 100: year = y2k(year) if date_format == 'mdy' or num_2 > 12: month = num_1 day = num_2 else: day = num_1 month = num_2 start_index = min(start_index, match.start()) end_index = max(end_index, match.end()) if date_ok: guess_date = datetime.datetime(year, month, day, hour, minute, second, tzinfo=datetime.timezone.utc) if ambiguous_year and guess_date > datetime_now: # Sometimes year is not shown within 6 months # Year is shown for dates in the future guess_date = guess_date.replace(year=year - 1) return guess_date, start_index, end_index else: raise ValueError('Failed to parse date from {}'.format(repr(text)))
[docs]def parse_month(text: str) -> int: '''Parse month string into integer.''' text = text.lower() try: return MONTH_MAP[text] except KeyError: pass try: return MONTH_MAP[text[:3]] except KeyError: pass raise ValueError('Month {} not found.'.format(repr(text)))
[docs]def y2k(year: int) -> int: '''Convert two digit year to four digit year.''' assert 0 <= year <= 99, 'Not a two digit year {}'.format(year) return year + 1000 if year >= 69 else year + 2000
DEFAULT_LANGUAGE_CODES = ( 'zh', 'es', 'en', 'hi', 'ar', 'pt', 'ru', 'ja', 'de', 'fr', 'pl', )
[docs]def parse_cldr_json(directory, language_codes=DEFAULT_LANGUAGE_CODES, massage=True): '''Parse CLDR JSON datasets to for date time things.''' am_strings = set() pm_strings = set() month_to_int = {} for lang in language_codes: path = os.path.join(directory, 'main', lang, 'ca-gregorian.json') with open(path) as in_file: doc = json.load(in_file) months_dict = doc['main'][lang]['dates']['calendars']['gregorian']['months']['format']['abbreviated'] day_periods_dict = doc['main'][lang]['dates']['calendars']['gregorian']['dayPeriods']['format']['abbreviated'] for month, month_str in months_dict.items(): if massage: month_str = unicodedata.normalize('NFKD', month_str).lower().strip('.') month_to_int[month_str] = int(month) am_str = day_periods_dict['am'] pm_str = day_periods_dict['pm'] if massage: am_str = unicodedata.normalize('NFKD', am_str).lower().strip('.') pm_str = unicodedata.normalize('NFKD', pm_str).lower().strip('.') am_strings.add(am_str) pm_strings.add(pm_str) print(pprint.pformat(am_strings)) print(pprint.pformat(pm_strings)) print(pprint.pformat(month_to_int))
if __name__ == '__main__': parse_cldr_json(sys.argv[1])