# Source code for percentagent.extract_patterns

#!/usr/bin/env python

from collections import defaultdict
import json
from pkg_resources import resource_stream
import pytz
import re

class _InternTable(dict):
    """
    A dictionary that doubles as an interning function.

    Calling the table with a value returns the first object ever passed in
    that compares equal to it (the value itself on first sight). For
    immutable values this lets equal objects be shared, which saves memory
    without affecting what the program computes.
    """

    def __call__(self, v):
        try:
            # Seen before: hand back the canonical object stored earlier.
            return self[v]
        except KeyError:
            # First sighting: this object becomes the canonical one.
            self[v] = v
            return v

class TimeLocaleSet(object):
    """
    Structured information about how a set of locales express dates and times.
    """

    @classmethod
    def from_json(cls, f):
        """
        Load a locale set from a JSON-formatted stream, such as one produced by
        ``utils/lc_time``.

        :param f: a readable stream holding a JSON object; its members are
            passed to the constructor as keyword arguments
        :return: the loaded locale set
        """

        return cls(**json.load(f))

    @classmethod
    def default(cls, provider="glibc"):
        """
        Load a locale set that was distributed with this package. See
        ``percentagent/locales/`` for the available sets.

        :param provider: base name (without ``.json``) of the file under
            ``locales/`` to load
        :return: the loaded locale set
        """

        path = "locales/{}.json".format(provider)
        with resource_stream(__name__, path) as f:
            return cls.from_json(f)

    @classmethod
    def _localized_conversion(cls, uniq, keywords, fmt, offset, d):
        """
        Record every word of the semicolon-separated lists in ``d`` as a
        keyword for the conversion specifier ``fmt``.

        Words are stripped of surrounding whitespace and case-folded before
        being used as keys. Empty entries are not filtered out, so an empty
        field registers the empty string as a keyword.

        :param uniq: callable applied to each locale name before it is stored
        :param keywords: nested mapping ``keywords[word][fmt, value]`` of
            sets of locales, updated in place
        :param fmt: conversion specifier character to file the words under
        :param offset: value given to the first word of each list; following
            words count up from it
        :param d: mapping from a ``;``-separated word list to the locales
            which use that list
        """
        for v, locales in d.items():
            for value, word in enumerate(v.split(";"), offset):
                keywords[word.strip().casefold()][fmt, value].update(map(uniq, locales))

    @staticmethod
    def _freeze_context(uniqlocalesets, table):
        """
        Convert a ``{pattern: {fmt: locales}}`` table into a plain dictionary
        mapping each pattern to a tuple of ``(fmt, locales)`` pairs, where
        ``locales`` is a sorted tuple passed through ``uniqlocalesets``.
        """
        return {
            pattern: tuple(
                (fmt, uniqlocalesets(tuple(sorted(locales))))
                for fmt, locales in fmts.items()
            )
            for pattern, fmts in table.items()
        }

    # Conversion specifiers which are folded into another specifier before
    # prefix/suffix patterns are recorded.
    _equivalents = {
        'e': 'd',
        'I': 'H',
        'k': 'H',
        'l': 'H',
        'Y': 'y',
    }

    # Some patterns are common across so many locales that they are useless for
    # guessing which locale the input came from, and should just be allowed for all
    # locales.
    _global_prefixes = (
        (":", "MS"),
        ("/", "Cymd"),
        ("-", "Cymd"),
        ("utc", "z"), # "UTC+hhmm"
        ("t", "H"), # ISO 8601: ...%dT%H...
    )
    _global_suffixes = (
        (":", "HM"),
        ("/", "ymd"),
        ("-", "ymd"),
        ("t", "d"), # ISO 8601: ...%dT%H...
    )

    # For each (fmt, value) key, the listed keywords share the union of the
    # locales recorded for any one of them.
    _merge_patterns = (
        (("p", 0), ("am", "a.m.")),
        (("p", 1), ("pm", "p.m.")),
    )

    # These symbols never provide semantic information about neighboring conversion
    # specifiers.
    _ignore = ('['
            # whitespace and right-to-left markers
            "\\s\u202b\u202c"
            # parens and dot
            "()."
            # various kinds of https://en.wikipedia.org/wiki/Comma
            ",\xb7\u055d\u060c\u07f8\u1363\u1802\u1808\u2e41\u2e4c\u3001\ua4fe\ua60d\ua6f5\uff0c"
            ']*')

    _fmt_token = re.compile(_ignore + r'%[-_0^#]?\d*[EO]?([a-zA-Z+%])' + _ignore)
    """
    A compiled regular expression to match :manpage:`strftime(3)`-style
    conversion specifiers. This regex contains a single group which returns the
    final conversion specifier character, skipping any flags, field widths, or
    modifiers. The :py:meth:`~re.Pattern.findall` method will return a list of
    just the conversion specifier characters; the :py:meth:`~re.Pattern.split`
    method will return the same but alternating with non-conversion text.
    """

    def __init__(self, formats=None, day=None, mon=None, am_pm=None, alt_digits=None, era=None):
        """
        All parameters are dictionaries which map a string to a set of locales
        in which that string is used.

        Except for :py:obj:`formats`, the dictionary keys are
        semicolon-separated (``;``) ordered lists. Their semantics are
        documented in :manpage:`locale(5)`.

        :param formats: Sample :manpage:`strftime(3)` format strings to extract
            prefix and suffix patterns from.
        :param day: Names of days of the week.
        :param mon: Names of months.
        :param am_pm: Strings indicating times before or after noon.
        :param alt_digits: Numbers from writing systems which do not use
            Unicode digits.
        :param era: Definitions of how years are counted and displayed.
            Accepted but not used yet.
        """

        # Equal locale names and equal locale tuples are shared rather than
        # stored once per pattern.
        uniqlocales = _InternTable()
        uniqlocalesets = _InternTable()

        keywords = defaultdict(lambda: defaultdict(set))
        self._localized_conversion(uniqlocales, keywords, "a", 0, day or {})
        self._localized_conversion(uniqlocales, keywords, "b", 1, mon or {})
        self._localized_conversion(uniqlocales, keywords, "p", 0, am_pm or {})
        self._localized_conversion(uniqlocales, keywords, "O", 0, alt_digits or {})

        for fmt, merges in self._merge_patterns:
            merged = set.union(*(keywords[pattern][fmt] for pattern in merges))
            for pattern in merges:
                keywords[pattern][fmt] = merged

        for timezone in pytz.all_timezones:
            tz = pytz.timezone(timezone)
            if hasattr(tz, "_transition_info"):
                shortnames = {tzname for _, _, tzname in tz._transition_info}
            else:
                shortnames = [tz._tzname]
            for tzname in shortnames:
                # Names starting with a sign are skipped (presumably they are
                # numeric offsets rather than abbreviations). An empty name
                # is skipped too instead of raising IndexError.
                if tzname and tzname[0] not in "+-":
                    # An empty locale set: the abbreviation is not tied to
                    # any particular locale.
                    keywords[tzname.casefold()]["Z", tzname] = frozenset()

        self._keywords = {
            pattern: tuple(
                (fmt, value, uniqlocalesets(tuple(sorted(locales))))
                for (fmt, value), locales in fmts.items()
            )
            for pattern, fmts in keywords.items()
        }

        prefixes = defaultdict(lambda: defaultdict(set))
        suffixes = defaultdict(lambda: defaultdict(set))

        # TODO: extract patterns from era

        for v, locales in (formats or {}).items():
            # split() yields literal text alternating with specifier
            # characters: text, fmt, text, fmt, ..., text.
            tokens = iter(self._fmt_token.split(v))
            prefix = next(tokens)
            # Zipping the iterator with itself consumes (fmt, following text)
            # pairs.
            for fmt, suffix in zip(tokens, tokens):
                # We don't need to look at surrounding context to recognize the
                # names of weekdays, months, or morning/afternoon.
                if fmt.lower() not in ("a", "b", "p"):
                    fmt = self._equivalents.get(fmt, fmt)
                    if prefix != '':
                        prefixes[prefix.casefold()][fmt].update(map(uniqlocales, locales))
                    if suffix != '':
                        suffixes[suffix.casefold()][fmt].update(map(uniqlocales, locales))

                # This conversion's suffix is the next conversion's prefix.
                prefix = suffix

        # Global patterns replace whatever was extracted for the same text and
        # carry an empty locale set.
        for pattern, fmts in self._global_prefixes:
            prefixes[pattern] = dict.fromkeys(fmts, frozenset())

        for pattern, fmts in self._global_suffixes:
            suffixes[pattern] = dict.fromkeys(fmts, frozenset())

        self._prefixes = self._freeze_context(uniqlocalesets, prefixes)
        self._suffixes = self._freeze_context(uniqlocalesets, suffixes)

    @property
    def keywords(self):
        """
        Group conversion specifiers by the non-numeric strings they can
        produce. This includes these specifiers:

        - Weekday names: ``%a``
        - Month names: ``%b``
        - AM/PM: ``%p``
        - Timezone abbreviations: ``%Z``
        - Non-decimal numbers: ``%O`` prefix (e.g. ``%Om`` for months)

        >>> glibc = TimeLocaleSet.default('glibc').keywords

        Many strings can only be produced by a single conversion specifier in a
        single locale. For example, according to the glibc locale database,
        "Agustus" is the ``id_ID`` (Indonesian) word for the 8th month, and
        does not appear in any other locale.

        >>> sorted(glibc['agustus'])
        [('b', 8, ('id_ID',))]

        However, other strings can be ambiguous. For example, "Ahad" is the
        word for Sunday in ``ms_MY`` (the Malay language locale for Malaysia),
        but the word for Wednesday in ``kab_DZ`` (the Kabyle language locale
        for Algeria). These languages are from entirely different language
        families but we can't tell them apart if all we see is this one word.
        However, in either case we do know that the word refers to a weekday.

        >>> sorted(glibc['ahad'])
        [('a', 0, ('ms_MY',)), ('a', 3, ('kab_DZ',))]

        Sometimes, without context, we can't even tell which role a word plays.
        "An" is the word for Tuesday in ``lt_LT`` (Lithuanian), but hours
        before noon are distinguished with "AN" in ``ak_GH`` (the Akan locale
        for Ghana).

        >>> sorted(glibc['an'])
        [('a', 2, ('lt_LT',)), ('p', 0, ('ak_GH',))]

        Similarly, "AWST" is the timezone abbreviation for Australian Western
        Standard Time, while "Awst" is the ``cy_GB`` (Welsh) word for the 8th
        month.

        >>> sorted(glibc['awst'])
        [('Z', 'AWST', ()), ('b', 8, ('cy_GB',))]

        Finally, in Chinese, Monday through Saturday are abbreviated using the
        numbers 1-6, and those numbers are written using the same characters in
        Japanese. So if we see those numbers, they could either be from numeric
        conversions such as ``%Od``, or from the abbreviated weekday
        conversion, ``%a``.

        >>> sorted(glibc['一'])
        [('O', 1, ('ja_JP', 'lzh_TW')), ('a', 1, ('cmn_TW', 'hak_TW', 'lzh_TW', 'nan_TW', 'yue_HK', 'zh_CN', 'zh_HK', 'zh_SG', 'zh_TW'))]
        """
        return self._keywords

    @property
    def prefixes(self):
        """
        Group conversion specifiers by the strings which may precede them.

        >>> glibc = TimeLocaleSet.default('glibc').prefixes

        In ``vi_VN`` (Vietnamese), "tháng" means "month", and "năm" means
        "year". Within the glibc locale database, we find that these words are
        used as prefix to the numeric value in question:

        >>> sorted(glibc['tháng'])
        [('m', ('vi_VN',))]
        >>> sorted(glibc['năm'])
        [('y', ('vi_VN',))]
        """
        return self._prefixes

    @property
    def suffixes(self):
        """
        Group conversion specifiers by the strings which may follow them.

        >>> suffixes = TimeLocaleSet(formats={
        ...     '%a, %Y.eko %bren %da': {'eu_ES'},
        ...     '%Y年%m月%d日': {'ja_JP'},
        ... }).suffixes

        In ``eu_ES`` (the Basque locale for Spain), year/month/day are followed
        by "eko", "ren", and "a", respectively. However, in our sample format
        string, "ren" follows ``%b``, which is the name of a month, not its
        number. So we don't extract it as a suffix; we rely on month names
        being sufficiently distinctive instead.

        >>> sorted(suffixes['eko'])
        [('y', ('eu_ES',))]
        >>> 'ren' in suffixes
        False
        >>> sorted(suffixes['a'])
        [('d', ('eu_ES',))]

        In ``ja_JP`` (Japanese), year/month/day are followed by "年", "月", and
        "日", respectively. Since our sample format string uses only numeric
        conversion specifiers, we extract all three as valid suffixes for their
        corresponding conversions.

        >>> sorted(suffixes['年'])
        [('y', ('ja_JP',))]
        >>> sorted(suffixes['月'])
        [('m', ('ja_JP',))]
        >>> sorted(suffixes['日'])
        [('d', ('ja_JP',))]
        """
        return self._suffixes

if __name__ == "__main__":
    # Dump the default locale set in a human-readable form: keywords first,
    # then prefix patterns, then suffix patterns.
    locale_set = TimeLocaleSet.default()

    # Each section pairs a lookup table with the template for its entry
    # lines. The last element of every entry is its collection of locales;
    # the elements before it fill the leading template fields.
    sections = (
        (locale_set.keywords, "- %{}={}: {}"),
        (locale_set.prefixes, "- # %{}: {}"),
        (locale_set.suffixes, "- %{} #: {}"),
    )

    for table, template in sections:
        for text, entries in sorted(table.items()):
            print("{!r}:".format(text))
            for entry in sorted(entries):
                leading = tuple(entry[:-1])
                names = ' '.join(sorted(entry[-1]))
                print(template.format(*(leading + (names,))))
            print()