1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """
25 Date parsing class. Serves as the base class for any localized
26 date parsing class. The default, base class provides parsing for
27 English.
28 """
29
30
31
32
33
34
35 import re
36 import calendar
37
38
39
40
41
42
43 import logging
44 log = logging.getLogger(".DateParser")
45
46
47
48
49
50
51 from gen.lib import Date, DateError
52 import GrampsLocale
53
54
55
56
57
58
# Number of days in each month, January first, for common and leap years
_max_days  = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]
_leap_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]

def gregorian_valid(date_tuple):
    """
    Tell whether a (day, month, year) tuple fits the Gregorian calendar.

    date_tuple -- sequence whose first three items are day, month and year.
                  A day or a month of 0 is accepted (the parser uses 0 for
                  a part of the date that was not given).

    Returns True when the month lies in 0..12 and the day does not exceed
    the length of that month; February has 29 days when calendar.isleap()
    reports a leap year.  Returns False for negative or out-of-range values
    and when the values cannot be used as integers (e.g. a year of None).
    """
    day = date_tuple[0]
    month = date_tuple[1]
    try:
        # Negative numbers would otherwise wrap around as list indexes
        # and be checked against the wrong month.
        if month < 0 or month > 12 or day < 0:
            return False

        if calendar.isleap(date_tuple[2]):
            month_lengths = _leap_days
        else:
            month_lengths = _max_days

        if month == 0:
            # No month given: the day is only limited by the longest month.
            return day <= 31
        return day <= month_lengths[month - 1]
    except (TypeError, IndexError):
        # Non-numeric parts, or a tuple without a year
        return False
77
78
79
80
81
82
84 """
85 Convert a text string into a Date object. If the date cannot be
86 converted, the text string is assigned.
87 """
88
# Captures, in order, the character following each of the first three
# '%' signs of a format string.
_fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

# English day and month abbreviations used by the RFC-style date pattern
_rfc_days = ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
_rfc_mons_to_int = {
    'Jan' : 1,
    'Feb' : 2,
    'Mar' : 3,
    'Apr' : 4,
    'May' : 5,
    'Jun' : 6,
    'Jul' : 7,
    'Aug' : 8,
    'Sep' : 9,
    'Oct' : 10,
    'Nov' : 11,
    'Dec' : 12,
    }

# Month-name table supplied by GrampsLocale
month_to_int = GrampsLocale.month_to_int

# Modifier words written in front of the date
modifier_to_int = {
    'before' : Date.MOD_BEFORE,
    'bef'    : Date.MOD_BEFORE,
    'bef.'   : Date.MOD_BEFORE,
    'after'  : Date.MOD_AFTER,
    'aft'    : Date.MOD_AFTER,
    'aft.'   : Date.MOD_AFTER,
    'about'  : Date.MOD_ABOUT,
    'abt.'   : Date.MOD_ABOUT,
    'abt'    : Date.MOD_ABOUT,
    'circa'  : Date.MOD_ABOUT,
    'c.'     : Date.MOD_ABOUT,
    'around' : Date.MOD_ABOUT,
    }

# Modifier words written behind the date; there are none in this table
modifier_after_to_int = {}
113
# Hebrew month names (lower case) and their month numbers
hebrew_to_int = {
    "tishri"  : 1,
    "heshvan" : 2,
    "kislev"  : 3,
    "tevet"   : 4,
    "shevat"  : 5,
    "adari"   : 6,
    "adarii"  : 7,
    "nisan"   : 8,
    "iyyar"   : 9,
    "sivan"   : 10,
    "tammuz"  : 11,
    "av"      : 12,
    "elul"    : 13,
    }

# French republican month names (lower case) and their month numbers;
# 'extra' is numbered 13
french_to_int = {
    u'vendémiaire' : 1,
    u'brumaire'    : 2,
    u'frimaire'    : 3,
    u'nivôse'      : 4,
    u'pluviôse'    : 5,
    u'ventôse'     : 6,
    u'germinal'    : 7,
    u'floréal'     : 8,
    u'prairial'    : 9,
    u'messidor'    : 10,
    u'thermidor'   : 11,
    u'fructidor'   : 12,
    u'extra'       : 13,
    }
131
# Islamic month names (lower case, several transliterations per month)
# and their month numbers
islamic_to_int = {
    "muharram"           : 1,  "muharram ul haram"  : 1,
    "safar"              : 2,  "rabi`al-awwal"      : 3,
    "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
    "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
    "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
    "rabi' ii"           : 4,  "jumada l-ula"       : 5,
    "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
    "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
    # "jumaada ii" is the second Jumada, i.e. month 6 like the three
    # spellings before it (it was mapped to 5, the first Jumada).
    "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
    "rajab"              : 7,  "sha`ban"            : 8,
    "sha`aban"           : 8,  "ramadan"            : 9,
    "ramadhan"           : 9,  "shawwal"            : 10,
    "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
    "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
    "dhu hijja"          : 12, "thw al-hijjah"      : 12,
    }
149
# Persian month names (lower case) and their month numbers
persian_to_int = {
    "farvardin"   : 1,
    "ordibehesht" : 2,
    "khordad"     : 3,
    "tir"         : 4,
    "mordad"      : 5,
    "shahrivar"   : 6,
    "mehr"        : 7,
    "aban"        : 8,
    "azar"        : 9,
    "dey"         : 10,
    "bahman"      : 11,
    "esfand"      : 12,
    }

# Spellings recognised as the BCE marker
bce = [
    "B.C.E.",
    "B.C.E",
    "BCE",
    "B.C.",
    "B.C",
    "BC",
    ]
160
# Calendar names and one-letter abbreviations (lower case) mapped to the
# Date calendar constants
calendar_to_int = {
    'gregorian'         : Date.CAL_GREGORIAN,
    'g'                 : Date.CAL_GREGORIAN,
    'julian'            : Date.CAL_JULIAN,
    'j'                 : Date.CAL_JULIAN,
    'hebrew'            : Date.CAL_HEBREW,
    'h'                 : Date.CAL_HEBREW,
    'islamic'           : Date.CAL_ISLAMIC,
    'i'                 : Date.CAL_ISLAMIC,
    'french'            : Date.CAL_FRENCH,
    'french republican' : Date.CAL_FRENCH,
    'f'                 : Date.CAL_FRENCH,
    'persian'           : Date.CAL_PERSIAN,
    'p'                 : Date.CAL_PERSIAN,
    }

# Quality words (lower case) mapped to the Date quality constants
quality_to_int = {
    'estimated'  : Date.QUAL_ESTIMATED,
    'est.'       : Date.QUAL_ESTIMATED,
    'est'        : Date.QUAL_ESTIMATED,
    'calc.'      : Date.QUAL_CALCULATED,
    'calc'       : Date.QUAL_CALCULATED,
    'calculated' : Date.QUAL_CALCULATED,
    }
185
# NOTE(review): the def line was missing from the reviewed text; the
# header below is reconstructed - confirm the name and signature.
def __init__(self):
    """
    Compile the regular expressions, build the table that maps each
    calendar to its parsing method, and derive the order of the numeric
    date fields from GrampsLocale.tformat.
    """
    self.init_strings()

    greg_julian = self._parse_greg_julian
    self.parser = {
        Date.CAL_GREGORIAN : greg_julian,
        Date.CAL_JULIAN    : greg_julian,
        Date.CAL_FRENCH    : self._parse_french,
        Date.CAL_PERSIAN   : self._parse_persian,
        Date.CAL_HEBREW    : self._parse_hebrew,
        Date.CAL_ISLAMIC   : self._parse_islamic,
        }

    # The three format letters, in the order they appear in the format;
    # day-month-year is assumed when they cannot be found.
    found = self._fmt_parse.match(GrampsLocale.tformat.lower())
    if found:
        order = found.groups()
    else:
        order = ('d', 'm', 'y')
    self.dmy = (order == ('d', 'm', 'y'))
    self.ymd = (order == ('y', 'm', 'd'))
205
def re_longest_first(self, keys):
    """
    Return the string of a RE group that matches any of the given keys.

    keys -- iterable of strings; it is not modified.

    The alternatives are ordered longest first, so that a key is tried
    before any shorter key.  Keys of equal length keep their relative
    order.  Every '.' character is quoted; all other characters are
    copied unchanged.  The result has the form '(key1|key2|...)'.
    """
    # sorted() works on a copy: the caller's table must not be reordered
    # as a side effect, and dictionary key views have no sort() method.
    ordered = sorted(keys, key=len, reverse=True)
    return '(' + '|'.join([key.replace('.', '\\.') for key in ordered]) + ')'
214
def init_strings(self):
    """
    This method compiles regular expression strings for matching dates.

    Most of the re's in most languages can stay as is. span and range
    most likely will need to change. Whatever change is done, this method
    may be called first as DateParser.init_strings(self) so that the
    invariant expressions don't need to be repeatedly coded. All
    differences can be coded after the DateParser.init_strings(self) call,
    that way they override stuff from this method. See DateParserRU() as
    an example.

    All patterns are written as raw strings.  The abbreviation "bet." in
    the range pattern has its dot escaped so that only a literal dot is
    accepted there.
    """
    self._rfc_mon_str = '(' + '|'.join(self._rfc_mons_to_int.keys()) + ')'
    self._rfc_day_str = '(' + '|'.join(self._rfc_days) + ')'

    # Alternation groups built from the word tables
    self._bce_str = self.re_longest_first(self.bce)
    self._qual_str = self.re_longest_first(self.quality_to_int.keys())
    self._mod_str = self.re_longest_first(self.modifier_to_int.keys())
    self._mod_after_str = self.re_longest_first(
        self.modifier_after_to_int.keys())

    self._mon_str = self.re_longest_first(self.month_to_int.keys())
    self._jmon_str = self.re_longest_first(self.hebrew_to_int.keys())
    self._fmon_str = self.re_longest_first(self.french_to_int.keys())
    self._pmon_str = self.re_longest_first(self.persian_to_int.keys())
    self._imon_str = self.re_longest_first(self.islamic_to_int.keys())
    self._cal_str = self.re_longest_first(self.calendar_to_int.keys())

    # The BCE pattern is the only word pattern compiled case-sensitively
    self._bce_re = re.compile(r"(.*)\s+%s( ?.*)" % self._bce_str)

    self._cal = re.compile(r"(.*)\s+\(%s\)( ?.*)" % self._cal_str,
                           re.IGNORECASE)
    self._qual = re.compile(r"(.* ?)%s\s+(.+)" % self._qual_str,
                            re.IGNORECASE)

    self._span = re.compile(
        r"(from)\s+(?P<start>.+)\s+to\s+(?P<stop>.+)",
        re.IGNORECASE)
    self._range = re.compile(
        r"(bet|bet\.|between)\s+(?P<start>.+)\s+and\s+(?P<stop>.+)",
        re.IGNORECASE)
    self._modifier = re.compile(r'%s\s+(.*)' % self._mod_str,
                                re.IGNORECASE)
    self._modifier_after = re.compile(r'(.*)\s+%s' % self._mod_after_str,
                                      re.IGNORECASE)
    self._abt2 = re.compile('<(.*)>', re.IGNORECASE)

    # Every calendar uses the same two textual layouts and differs only
    # in the month-name group that is substituted for %s:
    #   month_first:  <month> [number] [,] [year[/digits]]
    #   day_first:    [number] <month> [year[/digits]]
    month_first = r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?\s*$'
    day_first = r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?\s*$'

    self._text = re.compile(month_first % self._mon_str, re.IGNORECASE)
    self._text2 = re.compile(day_first % self._mon_str, re.IGNORECASE)
    self._jtext = re.compile(month_first % self._jmon_str, re.IGNORECASE)
    self._jtext2 = re.compile(day_first % self._jmon_str, re.IGNORECASE)
    self._ftext = re.compile(month_first % self._fmon_str, re.IGNORECASE)
    self._ftext2 = re.compile(day_first % self._fmon_str, re.IGNORECASE)
    self._ptext = re.compile(month_first % self._pmon_str, re.IGNORECASE)
    self._ptext2 = re.compile(day_first % self._pmon_str, re.IGNORECASE)
    self._itext = re.compile(month_first % self._imon_str, re.IGNORECASE)
    self._itext2 = re.compile(day_first % self._imon_str, re.IGNORECASE)

    # Purely numeric layouts
    self._numeric = re.compile(r"((\d+)[/\.]\s*)?((\d+)[/\.]\s*)?(\d+)\s*$")
    self._iso = re.compile(r"(\d+)(/(\d+))?-(\d+)-(\d+)\s*$")
    self._rfc = re.compile(
        r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
        % (self._rfc_day_str, self._rfc_mon_str))
285
287 """
288 Convert the string to an integer if the value is not None. If the
289 value is None, a zero is returned
290 """
291 if val == None:
292 return 0
293 else:
294 return int(val)
295
297 return self._parse_calendar(text, self._jtext, self._jtext2,
298 self.hebrew_to_int)
299
301 return self._parse_calendar(text, self._itext, self._itext2,
302 self.islamic_to_int)
303
305 return self._parse_calendar(text, self._ptext, self._ptext2,
306 self.persian_to_int)
307
309 return self._parse_calendar(text, self._ftext, self._ftext2,
310 self.french_to_int)
311
315
317 match = regex1.match(text.lower())
318 if match:
319 groups = match.groups()
320 if groups[0] == None:
321 m = 0
322 else:
323 m = mmap[groups[0].lower()]
324
325 if groups[2] == None:
326 y = self._get_int(groups[1])
327 d = 0
328 s = False
329 else:
330 d = self._get_int(groups[1])
331 y = int(groups[3])
332 s = groups[4] != None
333 value = (d, m, y, s)
334 if check and not check((d, m, y)):
335 value = Date.EMPTY
336 return value
337
338 match = regex2.match(text.lower())
339 if match:
340 groups = match.groups()
341 if groups[1] == None:
342 m = 0
343 else:
344 m = mmap[groups[1].lower()]
345
346 d = self._get_int(groups[0])
347
348 if groups[2] == None:
349 y = None
350 s = False
351 else:
352 y = int(groups[3])
353 s = groups[4] != None
354 value = (d, m, y, s)
355 if check and not check((d, m, y)):
356 value = Date.EMPTY
357 return value
358
359 return Date.EMPTY
360
def _parse_subdate(self, text, subparser=None):
    """
    Convert only the date portion of a date.

    text      -- the text to convert
    subparser -- method that handles dates written with a month name;
                 defaults to self._parse_greg_julian

    The subparser is tried first, followed by the ISO pattern, the RFC
    pattern and the purely numeric pattern.  Returns a
    (day, month, year, slash) tuple, or Date.EMPTY when nothing matches.
    Dates are verified with gregorian_valid only when the subparser is
    self._parse_greg_julian; a date failing that test yields Date.EMPTY.
    """
    if subparser is None:
        subparser = self._parse_greg_julian

    check = None
    if subparser == self._parse_greg_julian:
        check = gregorian_valid

    get_int = self._get_int

    # Month written as a name
    named = subparser(text)
    if named != Date.EMPTY:
        return named

    # year[/digits]-month-day
    found = self._iso.match(text)
    if found:
        parts = found.groups()
        year = get_int(parts[0])
        month = get_int(parts[3])
        day = get_int(parts[4])
        if check and not check((day, month, year)):
            return Date.EMPTY
        return (day, month, year, bool(parts[2]))

    # [weekday,] day month-abbreviation year hh:mm[:ss] +/-zone
    found = self._rfc.match(text)
    if found:
        parts = found.groups()
        day = get_int(parts[2])
        month = self._rfc_mons_to_int[parts[3]]
        year = get_int(parts[4])
        if check and not check((day, month, year)):
            return Date.EMPTY
        return (day, month, year, False)

    # Up to three numbers; their meaning follows self.ymd / self.dmy
    found = self._numeric.match(text)
    if not found:
        return Date.EMPTY

    parts = found.groups()
    if self.ymd:
        if parts[1] is None:
            # A lone number is the year
            year = get_int(parts[4])
            month = 0
            day = 0
        else:
            year = get_int(parts[1])
            month = get_int(parts[3])
            day = get_int(parts[4])
    else:
        year = get_int(parts[4])
        if self.dmy:
            month = get_int(parts[3])
            day = get_int(parts[1])
        else:
            month = get_int(parts[1])
            day = get_int(parts[3])

    if check and not check((day, month, year)):
        return Date.EMPTY
    return (day, month, year, False)
428
def match_calendar(self, text, cal):
    """
    Try parsing calendar.

    Return a (text, cal) tuple: the text with the calendar removed and
    the calendar index.  Without a match both are returned unchanged.
    """
    found = self._cal.match(text)
    if not found:
        return (text, cal)

    cal = self.calendar_to_int[found.group(2).lower()]
    return (found.group(1) + found.group(3), cal)
440
def match_quality(self, text, qual):
    """
    Try matching quality.

    Return a (text, qual) tuple: the text with the quality removed and
    the quality index.  Without a match both are returned unchanged.
    """
    found = self._qual.match(text)
    if not found:
        return (text, qual)

    qual = self.quality_to_int[found.group(2).lower()]
    return (found.group(1) + found.group(3), qual)
452
def match_span(self, text, cal, qual, date):
    """
    Try matching span date.

    On success, set the date and return 1. On failure return 0.
    """
    found = self._span.match(text)
    if not found:
        return 0

    text_parser = self.parser[cal]

    # Both ends are handled alike: strip the BCE marker, parse, and
    # negate the year when the marker was present.
    ends = []
    for group_name in ('start', 'stop'):
        (sub_text, is_bce) = self.match_bce(found.group(group_name))
        value = self._parse_subdate(sub_text, text_parser)
        if is_bce:
            value = self.invert_year(value)
        ends.append(value)

    date.set(qual, Date.MOD_SPAN, cal, ends[0] + ends[1])
    return 1
475
def match_range(self, text, cal, qual, date):
    """
    Try matching range date.

    On success, set the date and return 1. On failure return 0.
    """
    found = self._range.match(text)
    if not found:
        return 0

    text_parser = self.parser[cal]

    # Both ends are handled alike: strip the BCE marker, parse, and
    # negate the year when the marker was present.
    ends = []
    for group_name in ('start', 'stop'):
        (sub_text, is_bce) = self.match_bce(found.group(group_name))
        value = self._parse_subdate(sub_text, text_parser)
        if is_bce:
            value = self.invert_year(value)
        ends.append(value)

    date.set(qual, Date.MOD_RANGE, cal, ends[0] + ends[1])
    return 1
498
def match_bce(self, text):
    """
    Try matching BCE qualifier.

    Return a (text, bc) tuple: the text with the matched part removed and
    True when the qualifier was found, or the unchanged text and False
    when it was not.
    """
    found = self._bce_re.match(text)
    if not found:
        return (text, False)

    try:
        text = found.group(1) + found.group(3)
    except (IndexError, TypeError):
        # Only reachable when self._bce_re has no third group or that
        # group took no part in the match.  The text is then left as it
        # is; the groups are logged to help diagnose the pattern.
        log.warning("MATCH: %s", found.groups())
    return (text, True)
515
def match_modifier(self, text, cal, qual, bc, date):
    """
    Try matching date with modifier.

    text -- the text to parse
    cal  -- calendar index; selects the parsing method from self.parser
    qual -- quality index passed on to the date
    bc   -- when true, the year of the parsed date is negated
    date -- the Date object to set

    Three forms are tried in turn: a modifier word in front of the date,
    a modifier word behind the date (only when modifier_after_to_int is
    not empty), and a date enclosed in angle brackets, which is taken as
    an "about" date.  All three parse the date with the parser of the
    given calendar.

    On success, set the date and return True. On failure return False.
    """
    text_parser = self.parser[cal]

    # Modifier in front of the date
    found = self._modifier.match(text)
    if found:
        grps = found.groups()
        start = self._parse_subdate(grps[1], text_parser)
        mod = self.modifier_to_int.get(grps[0].lower(), Date.MOD_NONE)
        if start == Date.EMPTY:
            # The rest is not a date: keep the whole text instead
            date.set_modifier(Date.MOD_TEXTONLY)
            date.set_text_value(text)
        else:
            if bc:
                start = self.invert_year(start)
            date.set(qual, mod, cal, start)
        return True

    # Modifier behind the date
    if self.modifier_after_to_int:
        found = self._modifier_after.match(text)
        if found:
            grps = found.groups()
            start = self._parse_subdate(grps[0], text_parser)
            mod = self.modifier_after_to_int.get(grps[1].lower(),
                                                 Date.MOD_NONE)
            if bc:
                start = self.invert_year(start)
            date.set(qual, mod, cal, start)
            return True

    # Date in angle brackets
    found = self._abt2.match(text)
    if found:
        grps = found.groups()
        # Use the calendar's own parser here as well, so that month names
        # of the selected calendar are recognised inside the brackets.
        start = self._parse_subdate(grps[0], text_parser)
        if bc:
            start = self.invert_year(start)
        date.set(qual, Date.MOD_ABOUT, cal, start)
        return True

    return False
560
def set_date(self, date, text):
    """
    Parses the text and sets the date according to the parsing.

    date -- the Date object to fill in
    text -- the text to parse

    The text is first stored as the date's text value.  A calendar and a
    quality are then split off, and the remainder is tried as a span, a
    range, a date with a modifier and finally a plain date.  Text that
    cannot be parsed as a plain date is stored with date.set_as_text().
    """
    date.set_text_value(text)
    qual = Date.QUAL_NONE
    cal = Date.CAL_GREGORIAN

    (text, cal) = self.match_calendar(text, cal)
    (text, qual) = self.match_quality(text, qual)

    if self.match_span(text, cal, qual, date):
        return
    if self.match_range(text, cal, qual, date):
        return

    (text, bc) = self.match_bce(text)
    if self.match_modifier(text, cal, qual, bc, date):
        return

    try:
        subdate = self._parse_subdate(text, self.parser[cal])
        if subdate == Date.EMPTY and text != "":
            date.set_as_text(text)
            return
    except Exception:
        # Any parsing error degrades to a text-only date; interpreter
        # exits and keyboard interrupts are no longer swallowed here.
        date.set_as_text(text)
        return

    if bc:
        date.set(qual, Date.MOD_NONE, cal, self.invert_year(subdate))
    else:
        date.set(qual, Date.MOD_NONE, cal, subdate)

    # A date reporting a slash is switched to the Julian calendar and its
    # year is advanced by one.
    # NOTE(review): presumably this is dual dating (e.g. 1750/1) - confirm.
    if date.get_slash():
        date.set_calendar(Date.CAL_JULIAN)
        date.set_year(date.get_year() + 1)
599
def invert_year(self, subdate):
    """
    Return a copy of the first four items of subdate with the third item
    (the year) negated.
    """
    day, month, year, slash = subdate[0], subdate[1], subdate[2], subdate[3]
    return (day, month, -year, slash)
602
# NOTE(review): the def line was missing from the reviewed text; the
# header below is reconstructed - confirm the name and signature.
def parse(self, text):
    """
    Parses the text, returning a Date object.

    When set_date raises DateError, the returned Date holds the text
    unparsed (set_as_text).
    """
    result = Date()
    try:
        self.set_date(result, text)
    except DateError:
        result.set_as_text(text)
    return result
613