gramps/gramps2/src/DateParser.py

#coding: utf-8
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2004  Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

# $Id$

"""
U.S. English date parsing class. Serves as the base class for any localized
date parsing class.
"""

__author__ = "Donald N. Allingham"
__version__ = "$Revision$"

#-------------------------------------------------------------------------
#
# Python modules
#
#-------------------------------------------------------------------------
import re
import time
import locale

#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
import Date

#-------------------------------------------------------------------------
#
# Top-level module functions
#
#-------------------------------------------------------------------------
# Longest possible length of each Gregorian month, January first.
# February is 29 because the year is not consulted during validation.
_max_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]

def gregorian_valid(date_tuple):
    """
    Checks that the day and month of a date are plausible.

    date_tuple is a sequence whose first item is the day and whose second
    item is the month. Zero means "not specified" for either field: a zero
    day is always accepted, and a zero month accepts any day up to the
    longest month. The year is not examined, so February 29 is always
    accepted.

    Returns True if the pair is plausible, False otherwise. Negative
    values and values that cannot be compared or used as an index
    (e.g. None) yield False rather than an exception.
    """
    day = date_tuple[0]
    month = date_tuple[1]
    try:
        # Reject negatives explicitly; otherwise a negative month would
        # silently index _max_days from the end of the list.
        if month < 0 or month > 12 or day < 0:
            return False
        if month == 0:
            # Unknown month: only the longest month length can be enforced.
            return day <= max(_max_days)
        return day <= _max_days[month-1]
    except TypeError:
        # Non-numeric day or month
        return False

#-------------------------------------------------------------------------
#
# Parser class
#
#-------------------------------------------------------------------------
def _regex_alternation(names):
    """
    Builds a parenthesized regular expression group matching any one of
    the given literal names.

    The names are escaped, so characters such as '.' in an abbreviation
    are matched literally, and are tried longest first, so that a pattern
    which does not force what follows the name cannot stop at a short
    form such as "jan" when the text says "january".
    """
    ordered = sorted(names, key=len, reverse=True)
    return '(' + '|'.join([ re.escape(name) for name in ordered ]) + ')'

class DateParser:
    """
    Converts a text string into a Date object. If the date cannot be
    converted, the text string is assigned.

    The lookup tables (month names, modifiers, calendars, qualities) and
    the regular expressions built from them are class attributes. A
    subclass that replaces a lookup table must also replace the matching
    compiled expression, because the expressions below are built once
    from the tables of this class.
    """

    # determine the code set returned by nl_langinfo
    _codeset = locale.nl_langinfo(locale.CODESET)
    # picks out the three format letters of the locale's date format
    _fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

    # RFC-2822 only uses capitalized English abbreviated names, no locales.
    _rfc_days = ('Sun','Mon','Tue','Wed','Thu','Fri','Sat')
    _rfc_mons_to_int = {
        'Jan' : 1,
        'Feb' : 2,
        'Mar' : 3,
        'Apr' : 4,
        'May' : 5,
        'Jun' : 6,
        'Jul' : 7,
        'Aug' : 8,
        'Sep' : 9,
        'Oct' : 10,
        'Nov' : 11,
        'Dec' : 12,
        }

    # Full and abbreviated month names of the current locale, lower-cased,
    # mapped to the month number.
    month_to_int = {}
    for _num in range(1,13):
        for _prefix in ('MON_','ABMON_'):
            _name = locale.nl_langinfo(getattr(locale,'%s%d' % (_prefix,_num)))
            month_to_int[unicode(_name,_codeset).lower()] = _num
    # do not leave the loop temporaries behind as class attributes
    del _num, _prefix, _name

    modifier_to_int = {
        'before' : Date.MOD_BEFORE, 'bef'    : Date.MOD_BEFORE,
        'bef.'   : Date.MOD_BEFORE, 'after'  : Date.MOD_AFTER,
        'aft'    : Date.MOD_AFTER,  'aft.'   : Date.MOD_AFTER,
        'about'  : Date.MOD_ABOUT,  'abt.'   : Date.MOD_ABOUT,
        'abt'    : Date.MOD_ABOUT,  'circa'  : Date.MOD_ABOUT,
        'c.'     : Date.MOD_ABOUT,  'around' : Date.MOD_ABOUT,
        }

    hebrew_to_int = {
        "tishri"  : 1,   "heshvan" : 2,   "kislev"  : 3,
        "tevet"   : 4,   "shevat"  : 5,   "adari"   : 6,
        "adarii"  : 7,   "nisan"   : 8,   "iyyar"   : 9,
        "sivan"   : 10,  "tammuz"  : 11,  "av"      : 12,
        "elul"    : 13,
        }

    # Keys must not carry padding inside the quotes: they are used both
    # as regular expression alternatives and as lookup keys.
    french_to_int = {
        u'vend\xe9miaire' : 1,  u'brumaire'   : 2,
        u'frimaire'       : 3,  u'niv\xf4se'  : 4,
        u'pluvi\xf4se'    : 5,  u'vent\xf4se' : 6,
        u'germinal'       : 7,  u'flor\xe9al' : 8,
        u'prairial'       : 9,  u'messidor'   : 10,
        u'thermidor'      : 11, u'fructidor'  : 12,
        u'extra'          : 13
        }

    islamic_to_int = {
        "muharram"           : 1,  "muharram ul haram"  : 1,
        "safar"              : 2,  "rabi`al-awwal"      : 3,
        "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
        "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
        "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
        "rabi' ii"           : 4,  "jumada l-ula"       : 5,
        "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
        "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
        "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
        "rajab"              : 7,  "sha`ban"            : 8,
        "sha`aban"           : 8,  "ramadan"            : 9,
        "ramadhan"           : 9,  "shawwal"            : 10,
        "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
        "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
        "dhu hijja"          : 12, "thw al-hijjah"      : 12,
        }

    # Keys are lower case because _parse_calendar lower-cases the matched
    # month name before looking it up.
    persian_to_int = {
        "farvardin"   : 1,  "ordibehesht" : 2,
        "khordad"     : 3,  "tir"         : 4,
        "mordad"      : 5,  "shahrivar"   : 6,
        "mehr"        : 7,  "aban"        : 8,
        "azar"        : 9,  "dey"         : 10,
        "bahman"      : 11, "esfand"      : 12,
        }

    # These entries are regular expression fragments, not plain text.
    bce = ["BC", r"B\.C", r"B\.C\.", "BCE", r"B\.C\.E", r"B\.C\.E\."]

    calendar_to_int = {
        'gregorian'        : Date.CAL_GREGORIAN,
        'g'                : Date.CAL_GREGORIAN,
        'julian'           : Date.CAL_JULIAN,
        'j'                : Date.CAL_JULIAN,
        'hebrew'           : Date.CAL_HEBREW,
        'h'                : Date.CAL_HEBREW,
        'islamic'          : Date.CAL_ISLAMIC,
        'i'                : Date.CAL_ISLAMIC,
        'french'           : Date.CAL_FRENCH,
        'french republican': Date.CAL_FRENCH,
        'f'                : Date.CAL_FRENCH,
        'persian'          : Date.CAL_PERSIAN,
        'p'                : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        'estimated'  : Date.QUAL_ESTIMATED,
        'est.'       : Date.QUAL_ESTIMATED,
        'est'        : Date.QUAL_ESTIMATED,
        'calc.'      : Date.QUAL_CALCULATED,
        'calc'       : Date.QUAL_CALCULATED,
        'calculated' : Date.QUAL_CALCULATED,
        }

    _rfc_mon_str  = _regex_alternation(_rfc_mons_to_int.keys())
    _rfc_day_str  = _regex_alternation(_rfc_days)

    # bce already holds regular expression fragments, so it is not escaped
    _bce_str = '(' + '|'.join(bce) + ')'

    _qual_str = _regex_alternation(quality_to_int.keys())
    _mod_str  = _regex_alternation(modifier_to_int.keys())
    _mon_str  = _regex_alternation(month_to_int.keys())
    _jmon_str = _regex_alternation(hebrew_to_int.keys())
    _fmon_str = _regex_alternation(french_to_int.keys())
    _pmon_str = _regex_alternation(persian_to_int.keys())
    _cal_str  = _regex_alternation(calendar_to_int.keys())
    _imon_str = _regex_alternation(islamic_to_int.keys())

    _bce_re   = re.compile(r"(.+)\s+%s" % _bce_str)

    _cal      = re.compile(r"(.+)\s\(%s\)" % _cal_str,
                           re.IGNORECASE)
    _qual     = re.compile(r"%s\s+(.+)" % _qual_str,
                           re.IGNORECASE)
    _span     = re.compile(r"from\s+(.+)\s+to\s+(.+)",
                           re.IGNORECASE)
    _range    = re.compile(r"(between|bet\.|bet)\s+(.+)\s+and\s+(.+)",
                           re.IGNORECASE)
    _modifier = re.compile(r'%s\s+(.*)' % _mod_str,
                           re.IGNORECASE)
    # "<month> [day][,] [year[/nn]]" and "[day] <month> [year[/nn]]" for
    # each supported set of month names
    _text     = re.compile(r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?' % _mon_str,
                           re.IGNORECASE)
    _text2    = re.compile(r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?' % _mon_str,
                           re.IGNORECASE)
    _jtext    = re.compile(r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?' % _jmon_str,
                           re.IGNORECASE)
    _jtext2   = re.compile(r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?' % _jmon_str,
                           re.IGNORECASE)
    _ftext    = re.compile(r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?' % _fmon_str,
                           re.IGNORECASE)
    _ftext2   = re.compile(r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?' % _fmon_str,
                           re.IGNORECASE)
    _ptext    = re.compile(r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?' % _pmon_str,
                           re.IGNORECASE)
    _ptext2   = re.compile(r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?' % _pmon_str,
                           re.IGNORECASE)
    _itext    = re.compile(r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?' % _imon_str,
                           re.IGNORECASE)
    _itext2   = re.compile(r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?' % _imon_str,
                           re.IGNORECASE)
    # "<month> <day>-<day>[,] [year[/nn]]"
    _range2   = re.compile(r'%s\s+(\d+)-(\d+)\s*,?\s*((\d+)(/\d+)?)?' % _mon_str,
                           re.IGNORECASE)
    # NOTE(review): _numeric is not anchored at the end of the text, so a
    # leading number is accepted as a year even when other text follows.
    _numeric  = re.compile(r"((\d+)[/\.])?((\d+)[/\.])?(\d+)")
    _iso      = re.compile(r"(\d+)-(\d+)-(\d+)")
    _rfc      = re.compile(r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
                        % (_rfc_day_str,_rfc_mon_str))


    def __init__(self):
        """
        Sets up the calendar-to-parser table and determines from the
        locale's date format whether numeric dates are day/month/year.
        """
        # Every calendar that calendar_to_int can produce needs an entry
        # here, since set_date indexes this table with the matched calendar.
        self.parser = {
            Date.CAL_GREGORIAN : self._parse_greg_julian,
            Date.CAL_JULIAN    : self._parse_greg_julian,
            Date.CAL_PERSIAN   : self._parse_persian,
            Date.CAL_HEBREW    : self._parse_hebrew,
            Date.CAL_ISLAMIC   : self._parse_islamic,
            Date.CAL_FRENCH    : self._parse_french,
            }

        fmt = locale.nl_langinfo(locale.D_FMT)
        match = self._fmt_parse.match(fmt.lower())
        if match:
            self.dmy = (match.groups() == ('d','m','y'))
        else:
            # format could not be analysed: assume day/month/year
            self.dmy = True

    def _get_int(self,val):
        """
        Converts the string to an integer if the value is not None. If the
        value is None, a zero is returned
        """
        if val is None:
            return 0
        return int(val)

    def _parse_hebrew(self,text):
        """
        Parses text as a textual date using the Hebrew month names.
        """
        return self._parse_calendar(text,self._jtext,self._jtext2,
                                    self.hebrew_to_int)

    def _parse_islamic(self,text):
        """
        Parses text as a textual date using the Islamic month names.
        """
        return self._parse_calendar(text,self._itext,self._itext2,
                                    self.islamic_to_int)

    def _parse_persian(self,text):
        """
        Parses text as a textual date using the Persian month names.
        """
        return self._parse_calendar(text,self._ptext,self._ptext2,
                                    self.persian_to_int)

    def _parse_french(self,text):
        """
        Parses text as a textual date using the French Republican month
        names.
        """
        return self._parse_calendar(text,self._ftext,self._ftext2,
                                    self.french_to_int)

    def _parse_greg_julian(self,text):
        """
        Parses text as a textual date using the month names of the
        current locale.
        """
        return self._parse_calendar(text,self._text,self._text2,
                                    self.month_to_int)

    def _parse_calendar(self,text,regex1,regex2,mmap):
        """
        Parses a textual date with a named month.

        regex1 is the month-first pattern, regex2 the day-first pattern,
        and mmap maps lower-cased month names to month numbers. Returns a
        (day, month, year, slash) tuple, with zero for missing numeric
        fields, or Date.EMPTY if neither pattern matches.
        """
        match = regex1.match(text)
        if match:
            groups = match.groups()
            if groups[0] is None:
                m = 0
            else:
                m = mmap[groups[0].lower()]

            if groups[2] is None:
                # only one number follows the month: it is the year
                y = self._get_int(groups[1])
                d = 0
                s = None
            else:
                d = self._get_int(groups[1])
                y = int(groups[3])
                s = groups[4] is not None
            return (d,m,y,s)

        match = regex2.match(text)
        if match:
            groups = match.groups()
            if groups[1] is None:
                m = 0
            else:
                m = mmap[groups[1].lower()]

            d = self._get_int(groups[0])

            if groups[2] is None:
                y = 0
                s = None
            else:
                y = int(groups[3])
                s = groups[4] is not None
            return (d,m,y,s)
        return Date.EMPTY

    def _parse_subdate(self,text,subparser=None):
        """
        Converts only the date portion of a date.

        subparser is the textual month parser to try first; it defaults to
        the Gregorian/Julian one. If it fails, the ISO (year-month-day),
        RFC-2822 and purely numeric forms are tried in that order. Returns
        a (day, month, year, slash) tuple, or Date.EMPTY if nothing
        matches or the day/month pair is not valid.
        """
        if subparser is None:
            subparser = self._parse_greg_julian

        value = subparser(text)
        if value != Date.EMPTY:
            return value

        match = self._iso.match(text)
        if match:
            groups = match.groups()
            y = self._get_int(groups[0])
            m = self._get_int(groups[1])
            d = self._get_int(groups[2])
            if gregorian_valid((d,m)):
                return (d,m,y,False)
            else:
                return Date.EMPTY

        match = self._rfc.match(text)
        if match:
            groups = match.groups()
            # groups 0 and 1 hold the optional day name
            d = self._get_int(groups[2])
            m = self._rfc_mons_to_int[groups[3]]
            y = self._get_int(groups[4])
            if gregorian_valid((d,m)):
                return (d,m,y,False)
            else:
                return Date.EMPTY

        match = self._numeric.match(text)
        if match:
            groups = match.groups()
            # the order of the first two numbers depends on the locale
            if self.dmy:
                m = self._get_int(groups[3])
                d = self._get_int(groups[1])
            else:
                m = self._get_int(groups[1])
                d = self._get_int(groups[3])
            y = self._get_int(groups[4])
            if gregorian_valid((d,m)):
                return (d,m,y,False)
            else:
                return Date.EMPTY

        return Date.EMPTY

    def set_date(self,date,text):
        """
        Parses the text and sets the date according to the parsing.

        The text is examined for, in order: a trailing "(calendar)", a
        leading quality, a span, a range, a day range within one month, a
        trailing BC marker, a leading modifier, and finally a plain date.
        If nothing can be parsed, the date is set as text.
        """
        date.set_text_value(text)
        qual = Date.QUAL_NONE
        cal  = Date.CAL_GREGORIAN

        match = self._cal.match(text)
        if match:
            grps = match.groups()
            cal_name = grps[1].lower()
            # A subclass may replace calendar_to_int without replacing
            # _cal; ignore a matched name the table does not know instead
            # of failing with a KeyError.
            if cal_name in self.calendar_to_int:
                cal = self.calendar_to_int[cal_name]
                text = grps[0]

        text_parser = self.parser[cal]

        match = self._qual.match(text)
        if match:
            grps = match.groups()
            qual_name = grps[0].lower()
            # same guard as for the calendar name above
            if qual_name in self.quality_to_int:
                qual = self.quality_to_int[qual_name]
                text = grps[1]

        match = self._span.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[0],text_parser)
            stop = self._parse_subdate(grps[1],text_parser)
            date.set(qual,Date.MOD_SPAN,cal,start + stop)
            return

        match = self._range.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[1],text_parser)
            stop = self._parse_subdate(grps[2],text_parser)
            date.set(qual,Date.MOD_RANGE,cal,start + stop)
            return

        match = self._range2.match(text)
        if match:
            # groups: month, first day, last day, year text, year digits,
            # optional "/nn" part
            grps = match.groups()
            m = self.month_to_int[grps[0].lower()]

            d0 = self._get_int(grps[1])
            d1 = self._get_int(grps[2])

            if grps[3] is None:
                y = 0
                s = None
            else:
                y = int(grps[4])
                s = grps[5] is not None
            # NOTE(review): the calendar found above is not used here;
            # the range is always stored as Gregorian.
            date.set(qual,Date.MOD_RANGE,Date.CAL_GREGORIAN,
                     (d0,m,y,s,d1,m,y,s))
            return

        match = self._bce_re.match(text)
        bc = False
        if match:
            text = match.groups()[0]
            bc = True

        match = self._modifier.match(text)
        if match:
            grps = match.groups()
            # honour an explicitly given calendar
            start = self._parse_subdate(grps[1],text_parser)
            mod = self.modifier_to_int.get(grps[0].lower(),Date.MOD_NONE)
            if bc:
                date.set(qual,mod,cal,self.invert_year(start))
            else:
                date.set(qual,mod,cal,start)
            return

        subdate = self._parse_subdate(text,text_parser)
        if subdate == Date.EMPTY:
            # Try the other calendars' month names in turn; the first one
            # that parses also decides the calendar.
            fallbacks = (
                (self._parse_hebrew,  Date.CAL_HEBREW),
                (self._parse_persian, Date.CAL_PERSIAN),
                (self._parse_islamic, Date.CAL_ISLAMIC),
                (self._parse_french,  Date.CAL_FRENCH),
                )
            for alt_parser, alt_cal in fallbacks:
                subdate = alt_parser(text)
                if subdate != Date.EMPTY:
                    cal = alt_cal
                    break
            else:
                date.set_as_text(text)
                return

        if bc:
            date.set(qual,Date.MOD_NONE,cal,self.invert_year(subdate))
        else:
            date.set(qual,Date.MOD_NONE,cal,subdate)

    def invert_year(self,subdate):
        """
        Returns a copy of the (day, month, year, slash) tuple with the
        sign of the year reversed.
        """
        return (subdate[0],subdate[1],-subdate[2],subdate[3])

    def parse(self,text):
        """
        Parses the text, returning a Date object.
        """
        new_date = Date.Date()
        self.set_date(new_date,text)
        return new_date

#-------------------------------------------------------------------------
#
# French parser
#
#-------------------------------------------------------------------------
class DateParserFR(DateParser):
    """
    French date parser.

    Replaces the modifier, calendar and quality tables of DateParser and
    the regular expressions that recognize them, together with the span
    and range patterns.
    """

    modifier_to_int = {
        u'avant'    : Date.MOD_BEFORE,
        u'av.'      : Date.MOD_BEFORE,
        u'av'       : Date.MOD_BEFORE,
        u'apr\xe8s' : Date.MOD_AFTER,
        u'ap.'    : Date.MOD_AFTER,
        u'ap'     : Date.MOD_AFTER,
        u'env.'   : Date.MOD_ABOUT,
        u'env'    : Date.MOD_ABOUT,
        u'circa'  : Date.MOD_ABOUT,
        u'c.'     : Date.MOD_ABOUT,
        u'vers'   : Date.MOD_ABOUT,
        }

    calendar_to_int = {
        u'gr\xe9gorien'      : Date.CAL_GREGORIAN,
        u'g'                 : Date.CAL_GREGORIAN,
        u'julien'            : Date.CAL_JULIAN,
        u'j'                 : Date.CAL_JULIAN,
        u'h\xe9breu'         : Date.CAL_HEBREW,
        u'h'                 : Date.CAL_HEBREW,
        u'islamique'         : Date.CAL_ISLAMIC,
        u'i'                 : Date.CAL_ISLAMIC,
        u'r\xe9volutionnaire': Date.CAL_FRENCH,
        u'r'                 : Date.CAL_FRENCH,
        u'perse'             : Date.CAL_PERSIAN,
        u'p'                 : Date.CAL_PERSIAN,
        }

    # NOTE(review): these keys are still the English words.
    quality_to_int = {
        u'estimated'  : Date.QUAL_ESTIMATED,
        u'est.'       : Date.QUAL_ESTIMATED,
        u'est'        : Date.QUAL_ESTIMATED,
        u'calc.'      : Date.QUAL_CALCULATED,
        u'calc'       : Date.QUAL_CALCULATED,
        u'calculated' : Date.QUAL_CALCULATED,
        }

    # The expressions inherited from DateParser were compiled from the
    # English tables, so they have to be rebuilt from the tables above;
    # otherwise the French words are never recognized and a matched
    # English word is missing from the French table.
    _qual_str = '(' + '|'.join(
        [ re.escape(key) for key in quality_to_int.keys() ]
        ) + ')'
    _mod_str  = '(' + '|'.join(
        [ re.escape(key) for key in modifier_to_int.keys() ]
        ) + ')'
    _cal_str  = '(' + '|'.join(
        [ re.escape(key) for key in calendar_to_int.keys() ]
        ) + ')'

    # Group layout is the same as in DateParser. re.UNICODE lets
    # re.IGNORECASE fold the accented letters as well.
    _cal      = re.compile(u"(.+)\\s\\(%s\\)" % _cal_str,
                           re.IGNORECASE | re.UNICODE)
    _qual     = re.compile(u"%s\\s+(.+)" % _qual_str,
                           re.IGNORECASE | re.UNICODE)
    _modifier = re.compile(u"%s\\s+(.*)" % _mod_str,
                           re.IGNORECASE | re.UNICODE)
    _span     = re.compile(u"de\\s+(.+)\\s+\xe0\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)
    _range    = re.compile(u"(entre|ent\\.|ent)\\s+(.+)\\s+et\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)

#-------------------------------------------------------------------------
#
# Russian parser
#
#-------------------------------------------------------------------------
class DateParserRU(DateParser):
    """
    Russian date parser.

    Replaces the modifier, calendar and quality tables of DateParser and
    the regular expressions that recognize them, together with the span
    and range patterns.
    """

    modifier_to_int = {
        u'до'    : Date.MOD_BEFORE,
        u'по'    : Date.MOD_BEFORE,
        u'после' : Date.MOD_AFTER,
        u'п.'    : Date.MOD_AFTER,
        u'с'     : Date.MOD_AFTER,
        u'ок.'   : Date.MOD_ABOUT,
        u'около'    : Date.MOD_ABOUT,
        u'примерно'  : Date.MOD_ABOUT,
        u'прим.'     : Date.MOD_ABOUT,
        u'приблизительно'  : Date.MOD_ABOUT,
        u'приб.'  : Date.MOD_ABOUT,
        }

    calendar_to_int = {
        u'григорианский'      : Date.CAL_GREGORIAN,
        u'г'                 : Date.CAL_GREGORIAN,
        u'юлианский'            : Date.CAL_JULIAN,
        u'ю'                 : Date.CAL_JULIAN,
        u'еврейский'         : Date.CAL_HEBREW,
        u'е'                 : Date.CAL_HEBREW,
        u'исламский'         : Date.CAL_ISLAMIC,
        u'и'                 : Date.CAL_ISLAMIC,
        u'республиканский': Date.CAL_FRENCH,
        u'р'                 : Date.CAL_FRENCH,
        u'персидский'             : Date.CAL_PERSIAN,
        u'п'                 : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        u'оценено'  : Date.QUAL_ESTIMATED,
        u'оцен.'       : Date.QUAL_ESTIMATED,
        u'оц.'        : Date.QUAL_ESTIMATED,
        u'оцен'       : Date.QUAL_ESTIMATED,
        u'оц'        : Date.QUAL_ESTIMATED,
        u'вычислено'      : Date.QUAL_CALCULATED,
        u'вычисл.'       : Date.QUAL_CALCULATED,
        u'выч.' : Date.QUAL_CALCULATED,
        u'вычисл'       : Date.QUAL_CALCULATED,
        u'выч' : Date.QUAL_CALCULATED,
        }

    # The expressions inherited from DateParser were compiled from the
    # English tables, so they have to be rebuilt from the tables above;
    # otherwise the Russian words are never recognized and a matched
    # English word is missing from the Russian table.
    _qual_str = '(' + '|'.join(
        [ re.escape(key) for key in quality_to_int.keys() ]
        ) + ')'
    _mod_str  = '(' + '|'.join(
        [ re.escape(key) for key in modifier_to_int.keys() ]
        ) + ')'
    _cal_str  = '(' + '|'.join(
        [ re.escape(key) for key in calendar_to_int.keys() ]
        ) + ')'

    # Group layout is the same as in DateParser. The patterns are unicode
    # literals so that they hold Cyrillic characters rather than encoded
    # bytes, and re.UNICODE lets re.IGNORECASE fold Cyrillic letters.
    _cal      = re.compile(u"(.+)\\s\\(%s\\)" % _cal_str,
                           re.IGNORECASE | re.UNICODE)
    _qual     = re.compile(u"%s\\s+(.+)" % _qual_str,
                           re.IGNORECASE | re.UNICODE)
    _modifier = re.compile(u"%s\\s+(.*)" % _mod_str,
                           re.IGNORECASE | re.UNICODE)
    # The prepositions are non-capturing: set_date reads groups 0 and 1
    # of a span match as the start and stop dates.
    _span     = re.compile(u"(?:с|от)\\s+(.+)\\s+(?:по|до)\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)
    _range    = re.compile(u"(между|меж\\.|меж)\\s+(.+)\\s+и\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)