gramps/gramps2/src/DateParser.py
2004-11-10 04:20:47 +00:00

612 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#coding: utf-8
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2004 Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id$
"""
U.S. English date parsing class. Serves as the base class for any localized
date parsing class.
"""
__author__ = "Donald N. Allingham"
__version__ = "$Revision$"
#-------------------------------------------------------------------------
#
# Python modules
#
#-------------------------------------------------------------------------
import re
import time
import locale
#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
import Date
#-------------------------------------------------------------------------
#
# Top-level module functions
#
#-------------------------------------------------------------------------
# Largest day number permitted in each month (February allows the 29th,
# because the year is not taken into account here).
_max_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]

def gregorian_valid(date_tuple):
    """
    Checks that the day and month of a date tuple are plausible.

    date_tuple is a sequence whose first element is the day and whose
    second element is the month. A month of 0 stands for "month not
    given" and accepts any day up to 31. Months outside 0-12, days
    beyond the end of the month and values that cannot be compared as
    numbers are all reported as invalid.

    Returns True if the day/month pair is acceptable, False otherwise.
    """
    day = date_tuple[0]
    month = date_tuple[1]
    try:
        # Reject negative months explicitly; a negative index would
        # otherwise silently wrap around to the end of _max_days.
        if not 0 <= month <= 12:
            return False
        if month == 0:
            # Unknown month: only the absolute upper bound applies.
            return day <= 31
        return day <= _max_days[month-1]
    except TypeError:
        # Non-numeric day or month (e.g. None or a string).
        return False
#-------------------------------------------------------------------------
#
# Parser class
#
#-------------------------------------------------------------------------
def _locale_month_map(codeset):
    """
    Builds the table mapping the lower-cased month names of the current
    locale (full and abbreviated) to their month number.
    """
    months = {}
    for number in range(1, 13):
        for prefix in ('MON_', 'ABMON_'):
            item = getattr(locale, '%s%d' % (prefix, number))
            name = unicode(locale.nl_langinfo(item), codeset).lower()
            months[name] = number
    return months

def _alternation(keys):
    """
    Returns a regular expression group that matches any of the keys.

    Longer keys are listed first so that a short key which is a prefix
    of a longer one (e.g. 'jan' and 'january') cannot win the match and
    leave the rest of the word behind. Literal dots are escaped.
    """
    ordered = sorted(keys, key=len, reverse=True)
    return '(' + '|'.join([key.replace('.', r'\.') for key in ordered]) + ')'

class DateParser:
    """
    Converts a text string into a Date object. If the date cannot be
    converted, the text string is assigned.

    The keyword tables (modifier_to_int, calendar_to_int, ...) map the
    lower-cased text of a keyword to its numeric value; the regular
    expressions below are compiled from those tables when the class is
    created.
    """

    # determine the code set returned by nl_langinfo
    _codeset = locale.nl_langinfo(locale.CODESET)

    # extracts the three conversion characters of the locale date format
    _fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

    # flags shared by all keyword/month expressions; re.UNICODE makes
    # case-insensitive matching work for non-ASCII letters as well
    _re_flags = re.IGNORECASE | re.UNICODE

    # RFC-2822 only uses capitalized English abbreviated names, no locales.
    _rfc_days = ('Sun','Mon','Tue','Wed','Thu','Fri','Sat')

    _rfc_mons_to_int = {
        'Jan' : 1,  'Feb' : 2,  'Mar' : 3,  'Apr' : 4,
        'May' : 5,  'Jun' : 6,  'Jul' : 7,  'Aug' : 8,
        'Sep' : 9,  'Oct' : 10, 'Nov' : 11, 'Dec' : 12,
        }

    month_to_int = _locale_month_map(_codeset)

    modifier_to_int = {
        'before' : Date.MOD_BEFORE, 'bef'    : Date.MOD_BEFORE,
        'bef.'   : Date.MOD_BEFORE, 'after'  : Date.MOD_AFTER,
        'aft'    : Date.MOD_AFTER,  'aft.'   : Date.MOD_AFTER,
        'about'  : Date.MOD_ABOUT,  'abt.'   : Date.MOD_ABOUT,
        'abt'    : Date.MOD_ABOUT,  'circa'  : Date.MOD_ABOUT,
        'c.'     : Date.MOD_ABOUT,  'around' : Date.MOD_ABOUT,
        }

    hebrew_to_int = {
        "tishri"  : 1,   "heshvan" : 2,   "kislev"  : 3,
        "tevet"   : 4,   "shevat"  : 5,   "adari"   : 6,
        "adarii"  : 7,   "nisan"   : 8,   "iyyar"   : 9,
        "sivan"   : 10,  "tammuz"  : 11,  "av"      : 12,
        "elul"    : 13,
        }

    french_to_int = {
        u'vend\xe9miaire' : 1,  u'brumaire'   : 2,
        u'frimaire'       : 3,  u'niv\xf4se'  : 4,
        u'pluvi\xf4se'    : 5,  u'vent\xf4se' : 6,
        u'germinal'       : 7,  u'flor\xe9al' : 8,
        u'prairial'       : 9,  u'messidor'   : 10,
        u'thermidor'      : 11, u'fructidor'  : 12,
        u'extra'          : 13
        }

    islamic_to_int = {
        "muharram"           : 1,  "muharram ul haram"  : 1,
        "safar"              : 2,  "rabi`al-awwal"      : 3,
        "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
        "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
        "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
        "rabi' ii"           : 4,  "jumada l-ula"       : 5,
        "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
        "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
        "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
        "rajab"              : 7,  "sha`ban"            : 8,
        "sha`aban"           : 8,  "ramadan"            : 9,
        "ramadhan"           : 9,  "shawwal"            : 10,
        "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
        "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
        "dhu hijja"          : 12, "thw al-hijjah"      : 12,
        }

    # keys must be lower case: lookups are done on the lower-cased match
    persian_to_int = {
        "farvardin"   : 1,  "ordibehesht" : 2,
        "khordad"     : 3,  "tir"         : 4,
        "mordad"      : 5,  "shahrivar"   : 6,
        "mehr"        : 7,  "aban"        : 8,
        "azar"        : 9,  "dey"         : 10,
        "bahman"      : 11, "esfand"      : 12,
        }

    # regular expression fragments (already escaped) marking a BCE year
    bce = ["BC", r"B\.C", r"B\.C\.", "BCE", r"B\.C\.E", r"B\.C\.E\."]

    calendar_to_int = {
        'gregorian'        : Date.CAL_GREGORIAN,
        'g'                : Date.CAL_GREGORIAN,
        'julian'           : Date.CAL_JULIAN,
        'j'                : Date.CAL_JULIAN,
        'hebrew'           : Date.CAL_HEBREW,
        'h'                : Date.CAL_HEBREW,
        'islamic'          : Date.CAL_ISLAMIC,
        'i'                : Date.CAL_ISLAMIC,
        'french'           : Date.CAL_FRENCH,
        'french republican': Date.CAL_FRENCH,
        'f'                : Date.CAL_FRENCH,
        'persian'          : Date.CAL_PERSIAN,
        'p'                : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        'estimated'  : Date.QUAL_ESTIMATED,
        'est.'       : Date.QUAL_ESTIMATED,
        'est'        : Date.QUAL_ESTIMATED,
        'calc.'      : Date.QUAL_CALCULATED,
        'calc'       : Date.QUAL_CALCULATED,
        'calculated' : Date.QUAL_CALCULATED,
        }

    _rfc_mon_str = _alternation(_rfc_mons_to_int.keys())
    _rfc_day_str = _alternation(_rfc_days)
    _bce_str     = '(' + '|'.join(bce) + ')'
    _qual_str    = _alternation(quality_to_int.keys())
    _mod_str     = _alternation(modifier_to_int.keys())
    _mon_str     = _alternation(month_to_int.keys())
    _jmon_str    = _alternation(hebrew_to_int.keys())
    _fmon_str    = _alternation(french_to_int.keys())
    _pmon_str    = _alternation(persian_to_int.keys())
    _cal_str     = _alternation(calendar_to_int.keys())
    _imon_str    = _alternation(islamic_to_int.keys())

    # templates for "<month> <day>, <year>" and "<day> <month> <year>";
    # the %s is replaced by the month alternation of each calendar
    _month_first = r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?'
    _day_first   = r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?'

    _bce_re   = re.compile(r"(.+)\s+%s" % _bce_str)
    _cal      = re.compile(r"(.+)\s\(%s\)" % _cal_str, _re_flags)
    _qual     = re.compile(r"%s\s+(.+)" % _qual_str, _re_flags)
    _span     = re.compile(r"from\s+(.+)\s+to\s+(.+)", _re_flags)
    _range    = re.compile(r"(bet\.|bet|between)\s+(.+)\s+and\s+(.+)",
                           _re_flags)
    _modifier = re.compile(r'%s\s+(.*)' % _mod_str, _re_flags)
    _text     = re.compile(_month_first % _mon_str, _re_flags)
    _text2    = re.compile(_day_first % _mon_str, _re_flags)
    _jtext    = re.compile(_month_first % _jmon_str, _re_flags)
    _jtext2   = re.compile(_day_first % _jmon_str, _re_flags)
    _ftext    = re.compile(_month_first % _fmon_str, _re_flags)
    _ftext2   = re.compile(_day_first % _fmon_str, _re_flags)
    _ptext    = re.compile(_month_first % _pmon_str, _re_flags)
    _ptext2   = re.compile(_day_first % _pmon_str, _re_flags)
    _itext    = re.compile(_month_first % _imon_str, _re_flags)
    _itext2   = re.compile(_day_first % _imon_str, _re_flags)
    _range2   = re.compile(r'%s\s+(\d+)-(\d+)\s*,?\s*((\d+)(/\d+)?)?'
                           % _mon_str, _re_flags)
    _numeric  = re.compile(r"((\d+)[/\.])?((\d+)[/\.])?(\d+)")
    _iso      = re.compile(r"(\d+)-(\d+)-(\d+)")
    _rfc      = re.compile(
        r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
        % (_rfc_day_str,_rfc_mon_str))

    def __init__(self):
        """
        Sets up the calendar -> text parser table and determines from
        the locale date format whether numeric dates are day/month/year.
        """
        self.parser = {
            Date.CAL_GREGORIAN : self._parse_greg_julian,
            Date.CAL_JULIAN    : self._parse_greg_julian,
            Date.CAL_PERSIAN   : self._parse_persian,
            Date.CAL_HEBREW    : self._parse_hebrew,
            Date.CAL_ISLAMIC   : self._parse_islamic,
            # every calendar in calendar_to_int needs an entry here,
            # otherwise set_date fails with a KeyError
            Date.CAL_FRENCH    : self._parse_french,
            }

        fmt = locale.nl_langinfo(locale.D_FMT)
        match = self._fmt_parse.match(fmt.lower())
        if match:
            self.dmy = (match.groups() == ('d','m','y'))
        else:
            self.dmy = True

    def _get_int(self,val):
        """
        Converts the string to an integer if the value is not None. If the
        value is None, a zero is returned
        """
        if val is None:
            return 0
        return int(val)

    def _parse_hebrew(self,text):
        """Parses a textual date that uses Hebrew month names."""
        return self._parse_calendar(text,self._jtext,self._jtext2,
                                    self.hebrew_to_int)

    def _parse_islamic(self,text):
        """Parses a textual date that uses Islamic month names."""
        return self._parse_calendar(text,self._itext,self._itext2,
                                    self.islamic_to_int)

    def _parse_persian(self,text):
        """Parses a textual date that uses Persian month names."""
        return self._parse_calendar(text,self._ptext,self._ptext2,
                                    self.persian_to_int)

    def _parse_french(self,text):
        """Parses a textual date that uses French Republican month names."""
        return self._parse_calendar(text,self._ftext,self._ftext2,
                                    self.french_to_int)

    def _parse_greg_julian(self,text):
        """Parses a textual date that uses the month names of the locale."""
        return self._parse_calendar(text,self._text,self._text2,
                                    self.month_to_int)

    def _parse_calendar(self,text,regex1,regex2,mmap):
        """
        Parses a date written with a month name.

        regex1 matches the "<month> <day>, <year>" form and regex2 the
        "<day> <month> <year>" form; mmap maps the lower-cased month
        name to its number. Returns a (day, month, year, slash) tuple,
        or Date.EMPTY if neither expression matches.
        """
        match = regex1.match(text)
        if match:
            groups = match.groups()
            if groups[0] is None:
                m = 0
            else:
                m = mmap[groups[0].lower()]

            if groups[2] is None:
                # only one number follows the month: it is the year
                y = self._get_int(groups[1])
                d = 0
                s = None
            else:
                d = self._get_int(groups[1])
                y = int(groups[3])
                s = groups[4] is not None
            return (d,m,y,s)

        match = regex2.match(text)
        if match:
            groups = match.groups()
            if groups[1] is None:
                m = 0
            else:
                m = mmap[groups[1].lower()]

            d = self._get_int(groups[0])
            if groups[2] is None:
                y = 0
                s = None
            else:
                y = int(groups[3])
                s = groups[4] is not None
            return (d,m,y,s)

        return Date.EMPTY

    def _valid_or_empty(self,d,m,y):
        """
        Returns (d,m,y,False) if the day and month pass gregorian_valid,
        Date.EMPTY otherwise.
        """
        if gregorian_valid((d,m)):
            return (d,m,y,False)
        return Date.EMPTY

    def _parse_subdate(self,text,subparser=None):
        """
        Converts only the date portion of a date.

        The month-name parser (subparser, defaulting to the
        Gregorian/Julian one) is tried first, followed by the ISO,
        RFC-2822 and plain numeric forms. Returns a
        (day, month, year, slash) tuple or Date.EMPTY.
        """
        if subparser is None:
            subparser = self._parse_greg_julian

        value = subparser(text)
        if value != Date.EMPTY:
            return value

        match = self._iso.match(text)
        if match:
            groups = match.groups()
            y = self._get_int(groups[0])
            m = self._get_int(groups[1])
            d = self._get_int(groups[2])
            return self._valid_or_empty(d,m,y)

        match = self._rfc.match(text)
        if match:
            groups = match.groups()
            d = self._get_int(groups[2])
            m = self._rfc_mons_to_int[groups[3]]
            y = self._get_int(groups[4])
            return self._valid_or_empty(d,m,y)

        match = self._numeric.match(text)
        if match:
            groups = match.groups()
            # the locale decides which of the first two numbers is the day
            if self.dmy:
                m = self._get_int(groups[3])
                d = self._get_int(groups[1])
            else:
                m = self._get_int(groups[1])
                d = self._get_int(groups[3])
            y = self._get_int(groups[4])
            return self._valid_or_empty(d,m,y)

        return Date.EMPTY

    def set_date(self,date,text):
        """
        Parses the text and sets the date according to the parsing.

        A trailing "(calendar)" and a leading quality keyword are
        stripped first. The remainder is then tried, in order, as a
        span, a range, a "<month> <day>-<day>" range, a date with a
        modifier and finally a single date. If nothing matches, the
        date is stored as plain text.
        """
        date.set_text_value(text)
        qual = Date.QUAL_NONE
        cal = Date.CAL_GREGORIAN

        match = self._cal.match(text)
        if match:
            grps = match.groups()
            cal = self.calendar_to_int[grps[1].lower()]
            text = grps[0]

        text_parser = self.parser[cal]

        match = self._qual.match(text)
        if match:
            grps = match.groups()
            qual = self.quality_to_int[grps[0].lower()]
            text = grps[1]

        match = self._span.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[0],text_parser)
            stop = self._parse_subdate(grps[1],text_parser)
            date.set(qual,Date.MOD_SPAN,cal,start + stop)
            return

        match = self._range.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[1],text_parser)
            stop = self._parse_subdate(grps[2],text_parser)
            date.set(qual,Date.MOD_RANGE,cal,start + stop)
            return

        match = self._range2.match(text)
        if match:
            grps = match.groups()
            m = self.month_to_int[grps[0].lower()]
            d0 = self._get_int(grps[1])
            d1 = self._get_int(grps[2])
            if grps[3] is None:
                y = 0
                s = None
            else:
                y = int(grps[3])
                s = grps[4] is not None
            # keep the calendar selected above instead of forcing
            # Gregorian, so that "(julian)" is not lost
            date.set(qual,Date.MOD_RANGE,cal,(d0,m,y,s,d1,m,y,s))
            return

        match = self._bce_re.match(text)
        bc = False
        if match:
            text = match.groups()[0]
            bc = True

        match = self._modifier.match(text)
        if match:
            grps = match.groups()
            # use the parser of the selected calendar, as span and
            # range do
            start = self._parse_subdate(grps[1],text_parser)
            mod = self.modifier_to_int.get(grps[0].lower(),Date.MOD_NONE)
            if bc:
                date.set(qual,mod,cal,self.invert_year(start))
            else:
                date.set(qual,mod,cal,start)
            return

        subdate = self._parse_subdate(text,text_parser)
        if subdate == Date.EMPTY:
            # fall back to the month names of the other calendars; the
            # first one that recognises the text also decides the calendar
            fallbacks = (
                (self._parse_hebrew,  Date.CAL_HEBREW),
                (self._parse_persian, Date.CAL_PERSIAN),
                (self._parse_islamic, Date.CAL_ISLAMIC),
                (self._parse_french,  Date.CAL_FRENCH),
                )
            for parse_func,parse_cal in fallbacks:
                subdate = parse_func(text)
                if subdate != Date.EMPTY:
                    cal = parse_cal
                    break
            else:
                date.set_as_text(text)
                return

        if bc:
            date.set(qual,Date.MOD_NONE,cal,self.invert_year(subdate))
        else:
            date.set(qual,Date.MOD_NONE,cal,subdate)

    def invert_year(self,subdate):
        """Returns the date tuple with the sign of its year reversed."""
        return (subdate[0],subdate[1],-subdate[2],subdate[3])

    def parse(self,text):
        """
        Parses the text, returning a Date object.
        """
        new_date = Date.Date()
        self.set_date(new_date,text)
        return new_date
#-------------------------------------------------------------------------
#
# French parser
#
#-------------------------------------------------------------------------
class DateParserFR(DateParser):
    """
    French date parsing class. Replaces the keyword tables of DateParser
    and the expressions that depend on them.
    """

    modifier_to_int = {
        u'avant'    : Date.MOD_BEFORE,
        u'av.'      : Date.MOD_BEFORE,
        u'av'       : Date.MOD_BEFORE,
        u'apr\xe8s' : Date.MOD_AFTER,
        u'ap.'      : Date.MOD_AFTER,
        u'ap'       : Date.MOD_AFTER,
        u'env.'     : Date.MOD_ABOUT,
        u'env'      : Date.MOD_ABOUT,
        u'circa'    : Date.MOD_ABOUT,
        u'c.'       : Date.MOD_ABOUT,
        u'vers'     : Date.MOD_ABOUT,
        }

    calendar_to_int = {
        u'gr\xe9gorien'        : Date.CAL_GREGORIAN,
        u'g'                   : Date.CAL_GREGORIAN,
        u'julien'              : Date.CAL_JULIAN,
        u'j'                   : Date.CAL_JULIAN,
        u'h\xe9breu'           : Date.CAL_HEBREW,
        u'h'                   : Date.CAL_HEBREW,
        u'islamique'           : Date.CAL_ISLAMIC,
        u'i'                   : Date.CAL_ISLAMIC,
        u'r\xe9volutionnaire'  : Date.CAL_FRENCH,
        u'r'                   : Date.CAL_FRENCH,
        u'perse'               : Date.CAL_PERSIAN,
        u'p'                   : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        u'estimated'  : Date.QUAL_ESTIMATED,
        u'est.'       : Date.QUAL_ESTIMATED,
        u'est'        : Date.QUAL_ESTIMATED,
        u'calc.'      : Date.QUAL_CALCULATED,
        u'calc'       : Date.QUAL_CALCULATED,
        u'calculated' : Date.QUAL_CALCULATED,
        }

    # DateParser compiles its keyword expressions from its own tables
    # when that class is created, so they have to be rebuilt here from
    # the tables above; otherwise the inherited expressions would still
    # match only the English keywords. Longer keywords are listed first
    # and literal dots are escaped.
    _qual_str = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(quality_to_int.keys(),key=len,reverse=True) ]
        ) + ')'
    _mod_str  = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(modifier_to_int.keys(),key=len,reverse=True) ]
        ) + ')'
    _cal_str  = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(calendar_to_int.keys(),key=len,reverse=True) ]
        ) + ')'

    # re.UNICODE is required for case-insensitive matching of the
    # accented letters
    _cal      = re.compile(u"(.+)\\s\\(%s\\)" % _cal_str,
                           re.IGNORECASE | re.UNICODE)
    _qual     = re.compile(u"%s\\s+(.+)" % _qual_str,
                           re.IGNORECASE | re.UNICODE)
    _modifier = re.compile(u"%s\\s+(.*)" % _mod_str,
                           re.IGNORECASE | re.UNICODE)

    # "de <date> \xe0 <date>"
    _span     = re.compile(u"de\\s+(.+)\\s+\xe0\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)
    # "entre <date> et <date>"; group 1 is the keyword, as in DateParser
    _range    = re.compile(u"(ent\\.|ent|entre)\\s+(.+)\\s+et\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)
#-------------------------------------------------------------------------
#
# Russian parser
#
#-------------------------------------------------------------------------
class DateParserRU(DateParser):
    """
    Russian date parsing class. Replaces the keyword tables of DateParser
    and the expressions that depend on them.
    """

    modifier_to_int = {
        u'до'              : Date.MOD_BEFORE,
        u'по'              : Date.MOD_BEFORE,
        u'после'           : Date.MOD_AFTER,
        u'п.'              : Date.MOD_AFTER,
        u'с'               : Date.MOD_AFTER,
        u'ок.'             : Date.MOD_ABOUT,
        u'около'           : Date.MOD_ABOUT,
        u'примерно'        : Date.MOD_ABOUT,
        u'прим.'           : Date.MOD_ABOUT,
        u'приблизительно'  : Date.MOD_ABOUT,
        u'приб.'           : Date.MOD_ABOUT,
        }

    calendar_to_int = {
        u'григорианский'   : Date.CAL_GREGORIAN,
        u'г'               : Date.CAL_GREGORIAN,
        u'юлианский'       : Date.CAL_JULIAN,
        u'ю'               : Date.CAL_JULIAN,
        u'еврейский'       : Date.CAL_HEBREW,
        u'е'               : Date.CAL_HEBREW,
        u'исламский'       : Date.CAL_ISLAMIC,
        u'и'               : Date.CAL_ISLAMIC,
        u'республиканский' : Date.CAL_FRENCH,
        u'р'               : Date.CAL_FRENCH,
        u'персидский'      : Date.CAL_PERSIAN,
        u'п'               : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        u'оценено'    : Date.QUAL_ESTIMATED,
        u'оцен.'      : Date.QUAL_ESTIMATED,
        u'оц.'        : Date.QUAL_ESTIMATED,
        u'оцен'       : Date.QUAL_ESTIMATED,
        u'оц'         : Date.QUAL_ESTIMATED,
        u'вычислено'  : Date.QUAL_CALCULATED,
        u'вычисл.'    : Date.QUAL_CALCULATED,
        u'выч.'       : Date.QUAL_CALCULATED,
        u'вычисл'     : Date.QUAL_CALCULATED,
        u'выч'        : Date.QUAL_CALCULATED,
        }

    # DateParser compiles its keyword expressions from its own tables
    # when that class is created, so they have to be rebuilt here from
    # the tables above; otherwise the inherited expressions would match
    # English keywords that are missing from these tables. Longer
    # keywords are listed first and literal dots are escaped.
    _qual_str = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(quality_to_int.keys(),key=len,reverse=True) ]
        ) + ')'
    _mod_str  = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(modifier_to_int.keys(),key=len,reverse=True) ]
        ) + ')'
    _cal_str  = '(' + '|'.join(
        [ key.replace('.','\\.')
          for key in sorted(calendar_to_int.keys(),key=len,reverse=True) ]
        ) + ')'

    # All patterns are unicode literals compiled with re.UNICODE so that
    # the Cyrillic keywords match unicode text regardless of case.
    _cal      = re.compile(u"(.+)\\s\\(%s\\)" % _cal_str,
                           re.IGNORECASE | re.UNICODE)
    _qual     = re.compile(u"%s\\s+(.+)" % _qual_str,
                           re.IGNORECASE | re.UNICODE)
    _modifier = re.compile(u"%s\\s+(.*)" % _mod_str,
                           re.IGNORECASE | re.UNICODE)

    # The keywords are non-capturing: set_date reads groups 0 and 1 of a
    # span match as the start and stop dates.
    _span     = re.compile(u"(?:с|от)\\s+(.+)\\s+(?:по|до)\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)
    # Group 1 is the keyword, groups 2 and 3 the dates, as in DateParser.
    _range    = re.compile(u"(между|меж\\.|меж)\\s+(.+)\\s+и\\s+(.+)",
                           re.IGNORECASE | re.UNICODE)