#!/usr/bin/env python
from collections import defaultdict
import json
from pkg_resources import resource_stream
import pytz
import re
class _InternTable(dict):
"""
A callable which returns a value equal to its argument, but if it's called
twice with equivalent values, it always returns the object that was passed
to it first. As long as the objects are immutable, this saves memory
without changing the behavior of the program.
"""
def __call__(self, v):
return self.setdefault(v, v)
class TimeLocaleSet(object):
    """
    Structured information about how a set of locales express dates and times.
    """

    @classmethod
    def from_json(cls, f):
        """
        Load a locale set from a JSON-formatted stream, such as one produced by
        ``utils/lc_time``.

        :return: the loaded locale set
        """
        return cls(**json.load(f))

    @classmethod
    def default(cls, provider="glibc"):
        """
        Load a locale set that was distributed with this package. See
        ``percentagent/locales/`` for the available sets.

        :return: the loaded locale set
        """
        path = "locales/{}.json".format(provider)
        with resource_stream(__name__, path) as f:
            return cls.from_json(f)

    @classmethod
    def _localized_conversion(cls, uniq, keywords, fmt, offset, d):
        # Each key of d is a semicolon-separated ordered list of words (see
        # locale(5)); its position in the list, counted from `offset`, is the
        # value that conversion `fmt` produces for that word. Locales are
        # passed through `uniq` so equal locale names share one object.
        for v, locales in d.items():
            for value, word in enumerate(v.split(";"), offset):
                keywords[word.strip().casefold()][fmt, value].update(map(uniq, locales))

    # Conversion specifiers that are treated as equivalent to a canonical one
    # when extracting prefix/suffix patterns (e.g. %e is a variant of %d).
    _equivalents = {
        'e': 'd',
        'I': 'H',
        'k': 'H',
        'l': 'H',
        'Y': 'y',
    }

    # Some patterns are common across so many locales that they are useless for
    # guessing which locale the input came from, and should just be allowed for
    # all locales.
    _global_prefixes = (
        (":", "MS"),
        ("/", "Cymd"),
        ("-", "Cymd"),
        ("utc", "z"),  # "UTC+hhmm"
        ("t", "H"),  # ISO 8601: ...%dT%H...
    )

    _global_suffixes = (
        (":", "HM"),
        ("/", "ymd"),
        ("-", "ymd"),
        ("t", "d"),  # ISO 8601: ...%dT%H...
    )

    # Spelling variants that should be treated as the same keyword: any locale
    # that uses one spelling is assumed to accept the others too.
    _merge_patterns = (
        (("p", 0), ("am", "a.m.")),
        (("p", 1), ("pm", "p.m.")),
    )

    # These symbols never provide semantic information about neighboring
    # conversion specifiers.
    _ignore = ('['
        # whitespace and right-to-left markers
        "\\s\u202b\u202c"
        # parens and dot
        "()."
        # various kinds of https://en.wikipedia.org/wiki/Comma
        ",\xb7\u055d\u060c\u07f8\u1363\u1802\u1808\u2e41\u2e4c\u3001\ua4fe\ua60d\ua6f5\uff0c"
        ']*')

    _fmt_token = re.compile(_ignore + r'%[-_0^#]?\d*[EO]?([a-zA-Z+%])' + _ignore)
    """
    A compiled regular expression to match :manpage:`strftime(3)`-style
    conversion specifiers. This regex contains a single group which returns the
    final conversion specifier character, skipping any flags, field widths, or
    modifiers. The :py:meth:`~re.Pattern.findall` method will return a list of
    just the conversion specifier characters; the :py:meth:`~re.Pattern.split`
    method will return the same but alternating with non-conversion text.
    """

    def __init__(self, formats=None, day=None, mon=None, am_pm=None, alt_digits=None, era=None):
        """
        All parameters are dictionaries which map a string to a set of locales
        in which that string is used.

        Except for :py:obj:`formats`, the dictionary keys are
        semicolon-separated (``;``) ordered lists. Their semantics are
        documented in :manpage:`locale(5)`.

        :param formats: Sample :manpage:`strftime(3)` format strings to extract
            prefix and suffix patterns from.
        :param day: Names of days of the week.
        :param mon: Names of months.
        :param am_pm: Strings indicating times before or after noon.
        :param alt_digits: Numbers from writing systems which do not use
            Unicode digits.
        :param era: Definitions of how years are counted and displayed.
        """
        # Intern tables so equal locale names / locale-set tuples share one
        # object; this only saves memory, it never changes results.
        uniqlocales = _InternTable()
        uniqlocalesets = _InternTable()

        keywords = defaultdict(lambda: defaultdict(set))
        self._localized_conversion(uniqlocales, keywords, "a", 0, day or {})
        self._localized_conversion(uniqlocales, keywords, "b", 1, mon or {})
        self._localized_conversion(uniqlocales, keywords, "p", 0, am_pm or {})
        self._localized_conversion(uniqlocales, keywords, "O", 0, alt_digits or {})

        # Merge spelling variants: each pattern in a group gets the union of
        # the locale sets of all patterns in that group.
        for fmt, merges in self._merge_patterns:
            merged = set.union(*(keywords[pattern][fmt] for pattern in merges))
            for pattern in merges:
                keywords[pattern][fmt] = merged

        # Timezone abbreviations come from the pytz database rather than any
        # locale, so they are recorded with an empty locale set. Purely
        # numeric "+hhmm"-style names are skipped.
        for timezone in pytz.all_timezones:
            tz = pytz.timezone(timezone)
            if hasattr(tz, "_transition_info"):
                shortnames = set(tzname for _, _, tzname in tz._transition_info)
            else:
                shortnames = [tz._tzname]
            for tzname in shortnames:
                if tzname[0] not in "+-":
                    keywords[tzname.casefold()]["Z", tzname] = frozenset()

        # Freeze the nested dict-of-sets into hashable, deduplicated tuples.
        self._keywords = {
            pattern: tuple(
                (fmt, value, uniqlocalesets(tuple(sorted(locales))))
                for (fmt, value), locales in fmts.items()
            )
            for pattern, fmts in keywords.items()
        }

        prefixes = defaultdict(lambda: defaultdict(set))
        suffixes = defaultdict(lambda: defaultdict(set))

        # TODO: extract patterns from era
        for v, locales in (formats or {}).items():
            # split() alternates literal text and conversion characters, so
            # zip(tokens, tokens) pairs each conversion with the text after it.
            tokens = iter(self._fmt_token.split(v))
            prefix = next(tokens)
            for fmt, suffix in zip(tokens, tokens):
                # We don't need to look at surrounding context to recognize the
                # names of weekdays, months, or morning/afternoon.
                if fmt.lower() not in "abp":
                    fmt = self._equivalents.get(fmt, fmt)
                    if prefix != '':
                        prefixes[prefix.casefold()][fmt].update(map(uniqlocales, locales))
                    if suffix != '':
                        suffixes[suffix.casefold()][fmt].update(map(uniqlocales, locales))
                # This conversion's suffix is the next conversion's prefix.
                prefix = suffix

        # Locale-independent patterns override anything extracted above.
        for pattern, fmts in self._global_prefixes:
            prefixes[pattern] = dict.fromkeys(fmts, frozenset())
        for pattern, fmts in self._global_suffixes:
            suffixes[pattern] = dict.fromkeys(fmts, frozenset())

        self._prefixes = {
            pattern: tuple(
                (fmt, uniqlocalesets(tuple(sorted(locales))))
                for fmt, locales in fmts.items()
            )
            for pattern, fmts in prefixes.items()
        }

        self._suffixes = {
            pattern: tuple(
                (fmt, uniqlocalesets(tuple(sorted(locales))))
                for fmt, locales in fmts.items()
            )
            for pattern, fmts in suffixes.items()
        }

    @property
    def keywords(self):
        """
        Group conversion specifiers by the non-numeric strings they can
        produce. This includes these specifiers:

        - Weekday names: ``%a``
        - Month names: ``%b``
        - AM/PM: ``%p``
        - Timezone abbreviations: ``%Z``
        - Non-decimal numbers: ``%O`` prefix (e.g. ``%Om`` for months)

        >>> glibc = TimeLocaleSet.default('glibc').keywords

        Many strings can only be produced by a single conversion specifier in a
        single locale. For example, according to the glibc locale database,
        "Agustus" is the ``id_ID`` (Indonesian) word for the 8th month, and
        does not appear in any other locale.

        >>> sorted(glibc['agustus'])
        [('b', 8, ('id_ID',))]

        However, other strings can be ambiguous. For example, "Ahad" is the
        word for Sunday in ``ms_MY`` (the Malay language locale for Malaysia),
        but the word for Wednesday in ``kab_DZ`` (the Kabyle language locale
        for Algeria). These languages are from entirely different language
        families but we can't tell them apart if all we see is this one word.
        However, in either case we do know that the word refers to a weekday.

        >>> sorted(glibc['ahad'])
        [('a', 0, ('ms_MY',)), ('a', 3, ('kab_DZ',))]

        Sometimes, without context, we can't even tell which role a word plays.
        "An" is the word for Tuesday in ``lt_LT`` (Lithuanian), but hours
        before noon are distinguished with "AN" in ``ak_GH`` (the Akan locale
        for Ghana).

        >>> sorted(glibc['an'])
        [('a', 2, ('lt_LT',)), ('p', 0, ('ak_GH',))]

        Similarly, "AWST" is the timezone abbreviation for Australian Western
        Standard Time, while "Awst" is the ``cy_GB`` (Welsh) word for the 8th
        month.

        >>> sorted(glibc['awst'])
        [('Z', 'AWST', ()), ('b', 8, ('cy_GB',))]

        Finally, in Chinese, Monday through Saturday are abbreviated using the
        numbers 1-6, and those numbers are written using the same characters in
        Japanese. So if we see those numbers, they could either be from numeric
        conversions such as ``%Od``, or from the abbreviated weekday
        conversion, ``%a``.

        >>> sorted(glibc['一'])
        [('O', 1, ('ja_JP', 'lzh_TW')), ('a', 1, ('cmn_TW', 'hak_TW', 'lzh_TW', 'nan_TW', 'yue_HK', 'zh_CN', 'zh_HK', 'zh_SG', 'zh_TW'))]
        """
        return self._keywords

    @property
    def prefixes(self):
        """
        Group conversion specifiers by the strings which may precede them.

        >>> glibc = TimeLocaleSet.default('glibc').prefixes

        In ``vi_VN`` (Vietnamese), "tháng" means "month", and "năm" means
        "year". Within the glibc locale database, we find that these words are
        used as prefix to the numeric value in question:

        >>> sorted(glibc['tháng'])
        [('m', ('vi_VN',))]
        >>> sorted(glibc['năm'])
        [('y', ('vi_VN',))]
        """
        return self._prefixes

    @property
    def suffixes(self):
        """
        Group conversion specifiers by the strings which may follow them.

        >>> suffixes = TimeLocaleSet(formats={
        ...     '%a, %Y.eko %bren %da': {'eu_ES'},
        ...     '%Y年%m月%d日': {'ja_JP'},
        ... }).suffixes

        In ``eu_ES`` (the Basque locale for Spain), year/month/day are followed
        by "eko", "ren", and "a", respectively. However, in our sample format
        string, "ren" follows ``%b``, which is the name of a month, not its
        number. So we don't extract it as a suffix; we rely on month names
        being sufficiently distinctive instead.

        >>> sorted(suffixes['eko'])
        [('y', ('eu_ES',))]
        >>> 'ren' in suffixes
        False
        >>> sorted(suffixes['a'])
        [('d', ('eu_ES',))]

        In ``ja_JP`` (Japanese), year/month/day are followed by "年", "月", and
        "日", respectively. Since our sample format string uses only numeric
        conversion specifiers, we extract all three as valid suffixes for their
        corresponding conversions.

        >>> sorted(suffixes['年'])
        [('y', ('ja_JP',))]
        >>> sorted(suffixes['月'])
        [('m', ('ja_JP',))]
        >>> sorted(suffixes['日'])
        [('d', ('ja_JP',))]
        """
        return self._suffixes
if __name__ == "__main__":
    # Debug dump: print every keyword, prefix, and suffix pattern from the
    # default (glibc) locale set, with the conversions and locales it implies.
    locale_set = TimeLocaleSet.default()

    for pattern, fmts in sorted(locale_set.keywords.items()):
        print("{!r}:".format(pattern))
        for fmt, value, locales in sorted(fmts):
            print("- %{}={}: {}".format(fmt, value, ' '.join(sorted(locales))))
        print()

    for pattern, fmts in sorted(locale_set.prefixes.items()):
        print("{!r}:".format(pattern))
        for fmt, locales in sorted(fmts):
            print("- # %{}: {}".format(fmt, ' '.join(sorted(locales))))
        print()

    for pattern, fmts in sorted(locale_set.suffixes.items()):
        print("{!r}:".format(pattern))
        for fmt, locales in sorted(fmts):
            print("- %{} #: {}".format(fmt, ' '.join(sorted(locales))))
        print()