Package DateHandler :: Module _DateParser
[frames] | no frames]

Source Code for Module DateHandler._DateParser

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Gramps - a GTK+/GNOME based genealogy program 
  4  # 
  5  # Copyright (C) 2004-2006  Donald N. Allingham 
  6  # 
  7  # This program is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  # 
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22  # $Id: _DateParser.py 10121 2008-02-26 00:14:36Z dsblank $ 
 23   
 24  """ 
 25  Date parsing class. Serves as the base class for any localized 
 26  date parsing class. The default, base class provides parsing for 
 27  English. 
 28  """ 
 29   
 30  #------------------------------------------------------------------------- 
 31  # 
 32  # Python modules 
 33  # 
 34  #------------------------------------------------------------------------- 
 35  import re 
 36  import calendar 
 37   
 38  #------------------------------------------------------------------------- 
 39  # 
 40  # set up logging 
 41  # 
 42  #------------------------------------------------------------------------- 
 43  import logging 
 44  log = logging.getLogger(".DateParser") 
 45   
 46  #------------------------------------------------------------------------- 
 47  # 
 48  # GRAMPS modules 
 49  # 
 50  #------------------------------------------------------------------------- 
 51  from gen.lib import Date, DateError 
 52  import GrampsLocale 
 53   
 54  #------------------------------------------------------------------------- 
 55  # 
 56  # Top-level module functions 
 57  # 
 58  #------------------------------------------------------------------------- 
 59  _max_days  = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ] 
 60  _leap_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ] 
 61   
62 -def gregorian_valid(date_tuple):
63 day = date_tuple[0] 64 month = date_tuple[1] 65 valid = True 66 try: 67 if month > 12: 68 valid = False 69 elif calendar.isleap(date_tuple[2]): 70 if day > _leap_days[month-1]: 71 valid = False 72 elif day > _max_days[month-1]: 73 valid = False 74 except: 75 valid = False 76 return valid

#-------------------------------------------------------------------------
#
# Parser class
#
#-------------------------------------------------------------------------
class DateParser:
    """
    Convert a text string into a Date object. If the date cannot be
    converted, the text string is assigned.

    This base class parses English.  The lookup tables below are class
    attributes and the regular expressions are built from them in
    init_strings(), so a localized subclass can replace the tables and/or
    override init_strings().
    """

    # Picks the three field letters out of the locale's date format string.
    _fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

    # RFC-2822 only uses capitalized English abbreviated names, no locales.
    _rfc_days = ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
    _rfc_mons_to_int = {
        'Jan' : 1,  'Feb' : 2,  'Mar' : 3,  'Apr' : 4,
        'May' : 5,  'Jun' : 6,  'Jul' : 7,  'Aug' : 8,
        'Sep' : 9,  'Oct' : 10, 'Nov' : 11, 'Dec' : 12,
        }

    month_to_int = GrampsLocale.month_to_int

    # modifiers before the date
    modifier_to_int = {
        'before' : Date.MOD_BEFORE, 'bef'    : Date.MOD_BEFORE,
        'bef.'   : Date.MOD_BEFORE, 'after'  : Date.MOD_AFTER,
        'aft'    : Date.MOD_AFTER,  'aft.'   : Date.MOD_AFTER,
        'about'  : Date.MOD_ABOUT,  'abt.'   : Date.MOD_ABOUT,
        'abt'    : Date.MOD_ABOUT,  'circa'  : Date.MOD_ABOUT,
        'c.'     : Date.MOD_ABOUT,  'around' : Date.MOD_ABOUT,
        }
    # in some languages some of above listed modifiers are after the date,
    # in that case the subclass should put them into this dictionary instead
    modifier_after_to_int = {}

    hebrew_to_int = {
        "tishri"  : 1,   "heshvan" : 2,   "kislev"  : 3,
        "tevet"   : 4,   "shevat"  : 5,   "adari"   : 6,
        "adarii"  : 7,   "nisan"   : 8,   "iyyar"   : 9,
        "sivan"   : 10,  "tammuz"  : 11,  "av"      : 12,
        "elul"    : 13,
        }

    french_to_int = {
        u'vendémiaire'  : 1,    u'brumaire'   : 2,
        u'frimaire'     : 3,    u'nivôse'     : 4,
        u'pluviôse'     : 5,    u'ventôse'    : 6,
        u'germinal'     : 7,    u'floréal'    : 8,
        u'prairial'     : 9,    u'messidor'   : 10,
        u'thermidor'    : 11,   u'fructidor'  : 12,
        u'extra'        : 13
        }

    islamic_to_int = {
        "muharram"           : 1,  "muharram ul haram"  : 1,
        "safar"              : 2,  "rabi`al-awwal"      : 3,
        "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
        "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
        "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
        "rabi' ii"           : 4,  "jumada l-ula"       : 5,
        "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
        "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
        # "jumaada ii" is the same month as the other Jumada II spellings
        # (6); it used to be mapped to 5, colliding with "jumaada i".
        "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
        "rajab"              : 7,  "sha`ban"            : 8,
        "sha`aban"           : 8,  "ramadan"            : 9,
        "ramadhan"           : 9,  "shawwal"            : 10,
        "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
        "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
        "dhu hijja"          : 12, "thw al-hijjah"      : 12,
        }

    persian_to_int = {
        "farvardin"   : 1,  "ordibehesht" : 2,
        "khordad"     : 3,  "tir"         : 4,
        "mordad"      : 5,  "shahrivar"   : 6,
        "mehr"        : 7,  "aban"        : 8,
        "azar"        : 9,  "dey"         : 10,
        "bahman"      : 11, "esfand"      : 12,
        }

    bce = ["B.C.E.", "B.C.E", "BCE", "B.C.", "B.C", "BC" ]

    calendar_to_int = {
        'gregorian'        : Date.CAL_GREGORIAN,
        'g'                : Date.CAL_GREGORIAN,
        'julian'           : Date.CAL_JULIAN,
        'j'                : Date.CAL_JULIAN,
        'hebrew'           : Date.CAL_HEBREW,
        'h'                : Date.CAL_HEBREW,
        'islamic'          : Date.CAL_ISLAMIC,
        'i'                : Date.CAL_ISLAMIC,
        'french'           : Date.CAL_FRENCH,
        'french republican': Date.CAL_FRENCH,
        'f'                : Date.CAL_FRENCH,
        'persian'          : Date.CAL_PERSIAN,
        'p'                : Date.CAL_PERSIAN,
        }

    quality_to_int = {
        'estimated'  : Date.QUAL_ESTIMATED,
        'est.'       : Date.QUAL_ESTIMATED,
        'est'        : Date.QUAL_ESTIMATED,
        'calc.'      : Date.QUAL_CALCULATED,
        'calc'       : Date.QUAL_CALCULATED,
        'calculated' : Date.QUAL_CALCULATED,
        }

    def __init__(self):
        """
        Compile the regular expressions, build the calendar -> sub-parser
        table and derive the numeric field order from GrampsLocale.tformat.

        self.dmy / self.ymd record whether the locale format lists its
        fields as day-month-year or year-month-day.  When the format cannot
        be analysed, day-month-year is assumed.
        """
        self.init_strings()
        self.parser = {
            Date.CAL_GREGORIAN : self._parse_greg_julian,
            Date.CAL_JULIAN    : self._parse_greg_julian,
            Date.CAL_FRENCH    : self._parse_french,
            Date.CAL_PERSIAN   : self._parse_persian,
            Date.CAL_HEBREW    : self._parse_hebrew,
            Date.CAL_ISLAMIC   : self._parse_islamic,
            }

        match = self._fmt_parse.match(GrampsLocale.tformat.lower())
        if match:
            order = match.groups()
            self.dmy = (order == ('d', 'm', 'y'))
            self.ymd = (order == ('y', 'm', 'd'))
        else:
            self.dmy = True
            self.ymd = False

    def re_longest_first(self, keys):
        """
        returns a string for a RE group which contains the given keys
        sorted so that longest keys match first. Any '.' characters
        are quoted.

        The keys argument may be any iterable of strings; it is not
        modified.  Keys of equal length keep their incoming order.
        """
        # sorted() works on a copy, so shared tables such as the class-level
        # ``bce`` list are no longer reordered as a side effect.
        ordered = sorted(keys, key=len, reverse=True)
        return '(' + '|'.join([key.replace('.', r'\.') for key in ordered]) + ')'

    def init_strings(self):
        """
        This method compiles regular expression strings for matching dates.

        Most of the re's in most languages can stay as is. span and range
        most likely will need to change. Whatever change is done, this method
        may be called first as DateParser.init_strings(self) so that the
        invariant expressions don't need to be repeatedly coded. All
        differences can be coded after DateParser.init_strings(self) call,
        that way they override stuff from this method. See DateParserRU()
        as an example.
        """
        self._rfc_mon_str = '(' + '|'.join(self._rfc_mons_to_int.keys()) + ')'
        self._rfc_day_str = '(' + '|'.join(self._rfc_days) + ')'

        self._bce_str = self.re_longest_first(self.bce)
        self._qual_str = self.re_longest_first(self.quality_to_int.keys())
        self._mod_str = self.re_longest_first(self.modifier_to_int.keys())
        self._mod_after_str = self.re_longest_first(
            self.modifier_after_to_int.keys())

        self._mon_str = self.re_longest_first(self.month_to_int.keys())
        self._jmon_str = self.re_longest_first(self.hebrew_to_int.keys())
        self._fmon_str = self.re_longest_first(self.french_to_int.keys())
        self._pmon_str = self.re_longest_first(self.persian_to_int.keys())
        self._imon_str = self.re_longest_first(self.islamic_to_int.keys())
        self._cal_str = self.re_longest_first(self.calendar_to_int.keys())

        # bce, calendar type and quality may be either at the end or at
        # the beginning of the given date string, therefore they will
        # be parsed from the middle and will be in match.group(2).
        self._bce_re = re.compile(r"(.*)\s+%s( ?.*)" % self._bce_str)

        self._cal = re.compile(r"(.*)\s+\(%s\)( ?.*)" % self._cal_str,
                               re.IGNORECASE)
        self._qual = re.compile(r"(.* ?)%s\s+(.+)" % self._qual_str,
                                re.IGNORECASE)

        self._span = re.compile(
            r"(from)\s+(?P<start>.+)\s+to\s+(?P<stop>.+)",
            re.IGNORECASE)
        # The '.' of "bet." is escaped so that it only matches a literal dot.
        self._range = re.compile(
            r"(bet|bet\.|between)\s+(?P<start>.+)\s+and\s+(?P<stop>.+)",
            re.IGNORECASE)
        self._modifier = re.compile(r'%s\s+(.*)' % self._mod_str,
                                    re.IGNORECASE)
        self._modifier_after = re.compile(r'(.*)\s+%s' % self._mod_after_str,
                                          re.IGNORECASE)
        self._abt2 = re.compile('<(.*)>', re.IGNORECASE)

        # Textual dates come in two layouts for every calendar:
        #   month first: "<month> [number][,] [year[/nn]]"
        #   day first:   "[day] <month> [year[/nn]]"
        month_first = r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?\s*$'
        day_first = r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?\s*$'

        self._text = re.compile(month_first % self._mon_str, re.IGNORECASE)
        self._text2 = re.compile(day_first % self._mon_str, re.IGNORECASE)
        self._jtext = re.compile(month_first % self._jmon_str, re.IGNORECASE)
        self._jtext2 = re.compile(day_first % self._jmon_str, re.IGNORECASE)
        self._ftext = re.compile(month_first % self._fmon_str, re.IGNORECASE)
        self._ftext2 = re.compile(day_first % self._fmon_str, re.IGNORECASE)
        self._ptext = re.compile(month_first % self._pmon_str, re.IGNORECASE)
        self._ptext2 = re.compile(day_first % self._pmon_str, re.IGNORECASE)
        self._itext = re.compile(month_first % self._imon_str, re.IGNORECASE)
        self._itext2 = re.compile(day_first % self._imon_str, re.IGNORECASE)

        self._numeric = re.compile(
            r"((\d+)[/\.]\s*)?((\d+)[/\.]\s*)?(\d+)\s*$")
        self._iso = re.compile(r"(\d+)(/(\d+))?-(\d+)-(\d+)\s*$")
        self._rfc = re.compile(
            r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
            % (self._rfc_day_str, self._rfc_mon_str))

    def _get_int(self, val):
        """
        Convert the string to an integer if the value is not None. If the
        value is None, a zero is returned
        """
        if val is None:
            return 0
        return int(val)

    def _validated(self, d, m, y, s, check):
        """
        Return the (d, m, y, s) tuple, or Date.EMPTY when a check function
        is given and it rejects (d, m, y).
        """
        if check and not check((d, m, y)):
            return Date.EMPTY
        return (d, m, y, s)

    def _parse_hebrew(self, text):
        """Parse a textual Hebrew-calendar date; see _parse_calendar()."""
        return self._parse_calendar(text, self._jtext, self._jtext2,
                                    self.hebrew_to_int)

    def _parse_islamic(self, text):
        """Parse a textual Islamic-calendar date; see _parse_calendar()."""
        return self._parse_calendar(text, self._itext, self._itext2,
                                    self.islamic_to_int)

    def _parse_persian(self, text):
        """Parse a textual Persian-calendar date; see _parse_calendar()."""
        return self._parse_calendar(text, self._ptext, self._ptext2,
                                    self.persian_to_int)

    def _parse_french(self, text):
        """Parse a textual French-calendar date; see _parse_calendar()."""
        return self._parse_calendar(text, self._ftext, self._ftext2,
                                    self.french_to_int)

    def _parse_greg_julian(self, text):
        """
        Parse a textual Gregorian/Julian date; see _parse_calendar().
        The result is additionally validated with gregorian_valid().
        """
        return self._parse_calendar(text, self._text, self._text2,
                                    self.month_to_int, gregorian_valid)

    def _parse_calendar(self, text, regex1, regex2, mmap, check=None):
        """
        Parse a date whose month is spelled out.

        regex1 is the month-first pattern, regex2 the day-first pattern and
        mmap maps lower-case month names to month numbers.  When check is
        given it is called with (day, month, year) and a false result turns
        the parse into Date.EMPTY.

        Returns a (day, month, year, slash) tuple, where slash is True when
        the year was followed by a "/digits" suffix, or Date.EMPTY when
        neither pattern matches.
        """
        lowered = text.lower()

        match = regex1.match(lowered)
        if match:
            groups = match.groups()
            if groups[0] is None:
                m = 0
            else:
                m = mmap[groups[0].lower()]

            if groups[2] is None:
                # only one number follows the month: it is taken as the year
                y = self._get_int(groups[1])
                d = 0
                s = False
            else:
                d = self._get_int(groups[1])
                y = int(groups[3])
                s = groups[4] is not None
            return self._validated(d, m, y, s, check)

        match = regex2.match(lowered)
        if match:
            groups = match.groups()
            if groups[1] is None:
                m = 0
            else:
                m = mmap[groups[1].lower()]

            d = self._get_int(groups[0])

            if groups[2] is None:
                # no year given: the year is left as None
                y = None
                s = False
            else:
                y = int(groups[3])
                s = groups[4] is not None
            return self._validated(d, m, y, s, check)

        return Date.EMPTY

    def _parse_subdate(self, text, subparser=None):
        """
        Convert only the date portion of a date.

        The text is tried, in order, against the calendar-specific textual
        parser (subparser, Gregorian/Julian by default), the ISO pattern,
        the RFC-2822 pattern and finally the purely numeric pattern whose
        field order follows self.ymd / self.dmy.  gregorian_valid() is
        applied only when the Gregorian/Julian parser is in use.

        Returns a (day, month, year, slash) tuple or Date.EMPTY.
        """
        if subparser is None:
            subparser = self._parse_greg_julian

        # bound methods compare equal with ==, not with "is"
        if subparser == self._parse_greg_julian:
            check = gregorian_valid
        else:
            check = None

        value = subparser(text)
        if value != Date.EMPTY:
            return value

        match = self._iso.match(text)
        if match:
            groups = match.groups()
            y = self._get_int(groups[0])
            m = self._get_int(groups[3])
            d = self._get_int(groups[4])
            return self._validated(d, m, y, bool(groups[2]), check)

        match = self._rfc.match(text)
        if match:
            groups = match.groups()
            d = self._get_int(groups[2])
            m = self._rfc_mons_to_int[groups[3]]
            y = self._get_int(groups[4])
            return self._validated(d, m, y, False, check)

        match = self._numeric.match(text)
        if match:
            groups = match.groups()
            if self.ymd:
                # '1789' and ymd: incomplete date
                if groups[1] is None:
                    y = self._get_int(groups[4])
                    m = 0
                    d = 0
                else:
                    y = self._get_int(groups[1])
                    m = self._get_int(groups[3])
                    d = self._get_int(groups[4])
            else:
                y = self._get_int(groups[4])
                if self.dmy:
                    m = self._get_int(groups[3])
                    d = self._get_int(groups[1])
                else:
                    m = self._get_int(groups[1])
                    d = self._get_int(groups[3])
            return self._validated(d, m, y, False, check)

        return Date.EMPTY

    def match_calendar(self, text, cal):
        """
        Try parsing calendar.

        Return a (text, cal) tuple: the text with the "(calendar)" part
        removed and the matched calendar index.  Both are returned
        unchanged when no calendar is found.
        """
        match = self._cal.match(text)
        if match:
            cal = self.calendar_to_int[match.group(2).lower()]
            text = match.group(1) + match.group(3)
        return (text, cal)

    def match_quality(self, text, qual):
        """
        Try matching quality.

        Return a (text, qual) tuple: the text with the quality word removed
        and the matched quality index.  Both are returned unchanged when no
        quality is found.
        """
        match = self._qual.match(text)
        if match:
            qual = self.quality_to_int[match.group(2).lower()]
            text = match.group(1) + match.group(3)
        return (text, qual)

    def _match_compound(self, regex, modifier, text, cal, qual, date):
        """
        Shared implementation of match_span() and match_range().

        regex must expose the named groups 'start' and 'stop'.  Each half
        has its BCE marker stripped, is parsed with the parser for cal and
        gets its year negated when it was marked BCE.  On a match the date
        is set with the concatenated start and stop tuples and 1 is
        returned; otherwise 0 is returned.
        """
        match = regex.match(text)
        if not match:
            return 0

        text_parser = self.parser[cal]
        halves = []
        for name in ('start', 'stop'):
            (subtext, bc) = self.match_bce(match.group(name))
            subdate = self._parse_subdate(subtext, text_parser)
            if bc:
                subdate = self.invert_year(subdate)
            halves.append(subdate)

        date.set(qual, modifier, cal, halves[0] + halves[1])
        return 1

    def match_span(self, text, cal, qual, date):
        """
        Try matching span date.

        On success, set the date and return 1. On failure return 0.
        """
        return self._match_compound(self._span, Date.MOD_SPAN,
                                    text, cal, qual, date)

    def match_range(self, text, cal, qual, date):
        """
        Try matching range date.

        On success, set the date and return 1. On failure return 0.
        """
        return self._match_compound(self._range, Date.MOD_RANGE,
                                    text, cal, qual, date)

    def match_bce(self, text):
        """
        Try matching BCE qualifier.

        Return a (text, bc) tuple: the text with the matched part removed
        and True/False telling whether a BCE marker was found.
        """
        match = self._bce_re.match(text)
        bc = False
        if match:
            # bce is in the match.group(2)
            try:
                text = match.group(1) + match.group(3)
            except (IndexError, TypeError):
                # An overriding _bce_re without the expected three groups:
                # keep the text as is, but report the problem.
                log.warning("Unexpected BCE match groups: %s",
                            match.groups())
            bc = True
        return (text, bc)

    def _apply_modifier(self, date, text, qual, mod, cal, bc, start):
        """
        Store a modified date, or keep it as text when start is Date.EMPTY.

        When bc is true the year of start is negated before it is stored.
        """
        if start == Date.EMPTY:
            date.set_modifier(Date.MOD_TEXTONLY)
            date.set_text_value(text)
        elif bc:
            date.set(qual, mod, cal, self.invert_year(start))
        else:
            date.set(qual, mod, cal, start)

    def match_modifier(self, text, cal, qual, bc, date):
        """
        Try matching date with modifier.

        Three forms are tried in order: a modifier before the date, a
        modifier after the date (only when modifier_after_to_int is not
        empty) and the "<date>" notation, which means "about".  In every
        form a date part that cannot be parsed leaves the date text-only.

        On success, set the date and return True. On failure return False.
        """
        # modifiers before the date
        match = self._modifier.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[1], self.parser[cal])
            mod = self.modifier_to_int.get(grps[0].lower(), Date.MOD_NONE)
            self._apply_modifier(date, text, qual, mod, cal, bc, start)
            return True

        # modifiers after the date
        if self.modifier_after_to_int:
            match = self._modifier_after.match(text)
            if match:
                grps = match.groups()
                start = self._parse_subdate(grps[0], self.parser[cal])
                mod = self.modifier_after_to_int.get(grps[1].lower(),
                                                     Date.MOD_NONE)
                self._apply_modifier(date, text, qual, mod, cal, bc, start)
                return True

        # "<date>" notation; parsed with the parser of the selected
        # calendar, like the two forms above
        match = self._abt2.match(text)
        if match:
            start = self._parse_subdate(match.group(1), self.parser[cal])
            self._apply_modifier(date, text, qual, Date.MOD_ABOUT, cal, bc,
                                 start)
            return True

        return False

    def set_date(self, date, text):
        """
        Parses the text and sets the date according to the parsing.

        The calendar and quality markers are stripped first, then the text
        is tried as a span, a range, a modified date and finally a plain
        date.  Text that cannot be parsed is stored as a text-only date.
        """
        date.set_text_value(text)
        qual = Date.QUAL_NONE
        cal = Date.CAL_GREGORIAN

        (text, cal) = self.match_calendar(text, cal)
        (text, qual) = self.match_quality(text, qual)

        if self.match_span(text, cal, qual, date):
            return
        if self.match_range(text, cal, qual, date):
            return

        (text, bc) = self.match_bce(text)
        if self.match_modifier(text, cal, qual, bc, date):
            return

        try:
            subdate = self._parse_subdate(text, self.parser[cal])
        except Exception:
            # best effort: any parsing failure keeps the input as text
            date.set_as_text(text)
            return
        if subdate == Date.EMPTY and text != "":
            date.set_as_text(text)
            return

        if bc:
            date.set(qual, Date.MOD_NONE, cal, self.invert_year(subdate))
        else:
            date.set(qual, Date.MOD_NONE, cal, subdate)

        if date.get_slash():
            date.set_calendar(Date.CAL_JULIAN)
            date.set_year(date.get_year() + 1) # year++ and forces recalc

    def invert_year(self, subdate):
        """
        Return a copy of the (day, month, year, slash) tuple with the year
        negated.
        """
        return (subdate[0], subdate[1], -subdate[2], subdate[3])

    def parse(self, text):
        """
        Parses the text, returning a Date object.

        When set_date() raises DateError the returned Date holds the text
        as is.
        """
        new_date = Date()
        try:
            self.set_date(new_date, text)
        except DateError:
            new_date.set_as_text(text)
        return new_date
613