# # WCPull.py - Pull trees off of WorldConnect # # Copyright (C) 2009 Robert Ham # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # import sys import threading import re import random import cgi import urllib2 import gobject import gtk from traceback import print_exc from TransUtils import sgettext as _ from HTMLParser import HTMLParser import Errors from DataViews import register, Gramplet from QuestionDialog import ErrorDialog, DBErrorDialog from gen.lib.person import Person from gen.lib.place import Place from gen.lib.event import Event from gen.lib.date import Date from gen.lib.eventtype import EventType from gen.lib.eventref import EventRef from gen.lib.attribute import Attribute from gen.lib.src import Source from gen.lib.srcref import SourceRef from gen.lib.note import Note from gen.lib.notetype import NoteType from gen.lib.family import Family from gen.lib.childref import ChildRef from gen.lib.name import Name from gen.lib.nametype import NameType from gen.lib.familyreltype import FamilyRelType WCPULL_DB_ATTRIBUTE = 'WCPull Database' URL_BASE = 'http://wc.rootsweb.ancestry.com/cgi-bin/igm.cgi' def get_anchor_href_parameter(attrs, param_name): for (name, value) in attrs: if name == 'href': BASE = '/cgi-bin/igm.cgi?' 
if value.startswith(BASE): params = value[len(BASE)+1:] params = [ x.split('=') for x in params.split('&') ] for (name, value) in params: if name == param_name: return value #------------------------------------------------------------------------ # # HTML filter # #------------------------------------------------------------------------ class HTMLFilter: def __init__(self, consumer): self.__consumer = consumer self.__bad_tag_re = re.compile('<([^>]+ \d+)>') self.__bad_start_re = re.compile('<([^>]+\r?\n)') def feed(self, data): data = self.__bad_tag_re.sub('<\\1>', data) data = self.__bad_start_re.sub('<\\1', data) self.__consumer.feed(data) #------------------------------------------------------------------------ # # HTMLParser with proper data handling # #------------------------------------------------------------------------ def data_is_empty(data): if data == None or len(data) < 1: return True # search for non-whitespace if re.search('\S', data) == None: return True else: return False # there's no way with HTMLParser to specify that you want data # in single chunks so we have to override every function # LOLZ@python class DataParser(HTMLParser): def __init__(self): self.__data = '' HTMLParser.__init__(self) def handle_nondata(self): data = self.__data self.__data = '' if data_is_empty(data): return self.data_handle_data(data) def handle_data(self, data): # print "Data: `" + data + "'" self.__data += data def handle_starttag(self, tag, attrs): # print "Start tag: `" + tag + "'" self.handle_nondata() self.data_handle_starttag(tag, attrs) def handle_startendtag(self, tag, attrs): self.handle_nondata() self.data_handle_startendtag(tag, attrs) def handle_endtag(self, tag): # print "End tag: `" + tag + "'" self.handle_nondata() self.data_handle_endtag(tag) def handle_charref(self, name): self.handle_nondata() self.data_handle_charret(name) def handle_entityref(self, name): self.handle_nondata() self.data_handle_entityref(name) def handle_comment(self, data): 
self.handle_nondata() self.data_handle_comment(data) def handle_decl(self, decl): self.handle_nondata() self.data_handle_decl(decl) def handle_pi(self, data): self.handle_nondata() self.data_handle_pi(data) def data_handle_starttag(self, tag, attrs): None def data_handle_startendtag(self, tag, attrs): None def data_handle_endtag(self, tag): None def data_handle_charref(self, name): None def data_handle_entityref(self, name): None def data_handle_comment(self, data): None def data_handle_decl(self, decl): None def data_handle_pi(self, data): None #------------------------------------------------------------------------ # # Index page parser # #------------------------------------------------------------------------ class IndexParser(DataParser): def __init__(self, db, trans, log_func, db_name): self.__db = db self.__trans = trans self.__log = log_func self.__people = [] self.__db_name = db_name self.__got_original = False self.__last_offset = None self.__next_offset = None DataParser.__init__(self) def data_handle_starttag(self, tag, attrs): if tag == 'a': self.anchor(attrs) def anchor(self, attrs): for (name, value) in attrs: if name == 'href': self.href(value) def href(self, data): BASE = '/cgi-bin/igm.cgi?' 
if data.startswith(BASE): params = data[len(BASE)+1:] params = [ x.split('=') for x in params.split('&') ] self.igm(params) def igm(self, params): for (param, value) in params: if param == 'id': self.add_identifier(value) return elif param == 'recno' and value != '0': # self.__log('Offset: `' + value + "'") self.__last_offset = value return def data_handle_data(self, data): if not self.__last_offset: return # self.__log("Data: `" + data + "'") if data == 'Next Page': self.__next_offset = self.__last_offset def get_next_offset(self): if not self.__got_original: return None return self.__next_offset def add_identifier(self, identifier): person = self.__db.get_person_from_gramps_id(identifier) if person == None: person = Person() person.set_gramps_id(identifier) self.__db.add_person(person, self.__trans, False) modified = True self.__got_original = True else: modified = False self.__log("WARNING: Identifier `" + identifier + "' already exists (name: `" + person.get_primary_name().get_name() + "')") # this attribute signifies an as-yet unpulled person (and stores the WC db name) existing_db_name = person_get_wcpull_db_name(person) if not existing_db_name: attr = Attribute() attr.set_type(WCPULL_DB_ATTRIBUTE) attr.set_value(self.__db_name) person.add_attribute(attr) modified = True if modified: self.__db.commit_person(person, self.__trans) #------------------------------------------------------------------------ # # Person page parser # #------------------------------------------------------------------------ def standard_event_type_names(): ev = EventType() return ev.get_standard_xml() def ignore_datum_name(name): return name in ['ID', '_UID', 'Change Data', 'CHNG', 'Reference Number', 'REFN'] class TransactionHolder: def __init__(self, db, trans): self.db = db self.__trans = trans def store_event(self, typ, event): func = getattr(self.db, 'add_' + typ + '_event') func(event, self.__trans) self.db.commit_event(event, self.__trans) def commit_object(self, typ, object): 
commit_func = getattr(self.db, 'commit_' + typ) commit_func(object, self.__trans) def commit_person(self, object): self.commit_object('person', object) def commit_family(self, object): self.commit_object('family', object) def store_object(self, typ, object): add_func = getattr(self.db, 'add_' + typ) add_func(object, self.__trans) self.commit_object(typ, object) def store_place(self, place): self.store_object('place', place) def store_note(self, note): self.store_object('note', note) def store_source(self, source): self.store_object('source', source) def store_family(self, family): self.store_object('family', family) class DataHolder(TransactionHolder): def __init__(self, db, trans, log_func): self.__log = log_func self.__data = [] self.__object_datum_count = 0 def capitalisations(l): l.extend([x[0:1].upper() + x[1:] for x in l]) l.extend([x.upper() for x in l]) def reify(l): return '(' + '|'.join(l) + ')' months = ['january','feburary','march','april','may','june', 'july','august','september','october','november','december'] capitalisations(months) months.extend([x[0:3] for x in months]) months = reify(months) modifiers = ['abt','about','bef','before','est','estimated'] modifiers = reify(modifiers) self.__date_re_text = '(' + modifiers + ' )?((\d{1,2} ' + months + '|' + months + ') )?\d{4}?' self.__extract_date_re = re.compile('^(.*?)(' + self.__date_re_text + ')(.*)$', re.DOTALL) self.__extract_place_re = re.compile('^in\s+(.*)$', re.DOTALL) self.__extract_date_in_re = re.compile('^(.*) in\s+(.*)$', re.DOTALL) extensions = ['jpg','gif','png','mpg','avi'] capitalisations(extensions) extensions = reify(extensions) self.__object_re = re.compile('OBJE: .*?\.' 
+ extensions + '\n?') TransactionHolder.__init__(self, db, trans) def add_datum(self, name, value, sources): # self.__log("Got datum; name: `" + name + "', value: `" + value + "', sources: " + str(sources)) self.__data.append( (name, value, sources) ) def create_place(self, title): place = Place() place.set_title(title) self.store_place(place) return place def find_place(self, title): handles = self.db.get_place_handles() for handle in handles: place = self.db.get_place_from_handle(handle) if place.get_title() == title: return place return None def get_place(self, title): place = self.find_place(title) if place != None: return place return self.create_place(title) # returns (date, place, description) def extract_event_details(self, details): matches = self.__extract_place_re.match(details) if matches: place = matches.group(1).strip() return (None, self.get_place(place), None) matches = self.__extract_date_in_re.match(details) if matches: date = matches.group(1).strip() place = matches.group(2).strip() return (Date(date), self.get_place(place), None) date = Date(details) if date.is_valid(): return (date, None, None) matches = self.__extract_date_re.match(details) if matches: # print "Matches: " + str(matches.groups()) preamble = matches.group(1).strip() date = matches.group(2) postamble = matches.group(9).strip() # print "Preamble: `" + preamble + "'; date: `" + date + "'; postamble: `" + postamble + "'" date = Date(date) if postamble and len(postamble) > 0: place = self.get_place(postamble) else: place = None if preamble and len(preamble) > 0: description = preamble else: description = None return (date, place, description) place = self.find_place(details) if place: return (None, place, None) return (None, None, details) def finish_event(self, typ, value, source_refs, sources): event = Event() event.set_type(typ) (date, place, description) = self.extract_event_details(value) if date != None: event.set_date_object(date) if place != None: 
event.set_place_handle(place.get_handle()) if description != None and (typ != 'Death' or description != 'Y'): event.set_description(description) for source_ref in source_refs: if source_ref in sources: event.add_source_reference(sources[source_ref].generate()) else: self.__log("WARNING: No source `" + str(source_ref) + "' for event `" + typ + "' (value: `" + value + "')") self.store_event(self.get_event_type(), event) event_ref = EventRef() event_ref.set_reference_handle(event.get_handle()) self.finish_event_ref(event_ref) self.finish_event_local(event) def finish_attribute(self, typ, value, source_refs, sources): self.__log("WARNING: Don't know what to do with attribute `" + typ + "' (value: `" + value + "')") def finish_note(self, text, source_refs, sources): text = self.__object_re.sub('', text) if len(text) < 1: return lines = [x.strip() for x in text.split("\n")] kept_lines = [] for line in lines: stripped = line.strip() if stripped == '_NONE': continue match = re.match('^([A-Z]{4})(' + self.__date_re_text + ')$', stripped) if match: self.finish_datum(match.group(1), match.group(2), source_refs, sources) continue # match = re.match('^[^:]*:([A-Z]{4})(.*)$', stripped) # if match: # self.finish_attribute(match.group(1), match.group(2), source_refs, sources) # continue kept_lines.append(line) text = "\n".join(kept_lines) if len(text) < 1: return self.__log("WARNING: Don't know what to do with note `" + text + "'") def finish_event_datum(self, value, source_refs, sources): bits = value.split("\n", 1) if len(bits) == 2: self.finish_event('Unknown', bits[1], source_refs, sources) return self.__log("WARNING: Don't know what to do with event text `" + value + "'") def finish_datum(self, name, value, source_refs, sources): # in cummings-baggs, there are lots of object references given as notes # with the following types, in order, and 'Note' types immediately following # 'Title' if name in ['OBJE', 'FORM', 'FILE', '_SCBK', '_PRIM', '_TYPE'] \ or (name == 'Note' and 
self.__object_datum_count > 3) \ or (name == 'Title' and self.__object_datum_count > 3): ++self.__object_datum_count return self.__object_datum_count = 0 if name in ['ID', '_UID', 'Change Date', 'Reference Number', 'REFN', 'CHAN']: return if name in type_name_map: name = type_name_map[name] if self.finish_datum_local(name, value, source_refs, sources): return std_event_names = standard_event_type_names() if name in std_event_names: self.finish_event(name, value, source_refs, sources) return if name == 'Note': self.finish_note(value, source_refs, sources) return if name == 'Event': self.finish_event_datum(value, source_refs, sources) return self.__log("WARNING: Unknown datum name `" + name + "' (value: `" + value + "')") def finish_data(self, sources): for (name, value, source_refs) in self.__data: self.finish_datum(name, value, source_refs, sources) def add_family_parent(family, parent, other_parent): handle = parent.get_handle() gender = parent.get_gender() if gender == Person.UNKNOWN: if other_parent != None: other_gender = other_parent.get_gender() if other_gender == Person.MALE: gender = Person.FEMALE elif other_gender == Person.FEMALE: gender = Person.MALE else: raise "It went wrong" if gender == Person.FEMALE: family.set_mother_handle(handle) else: family.set_father_handle(handle) parent.add_family_handle(family.get_handle()) def add_family_child(family, child): child_ref = ChildRef() child_ref.set_reference_handle(child.get_handle()) family.add_child_ref(child_ref) child.add_parent_family_handle(family.get_handle()) class FamilyData(DataHolder): def __init__(self, db, trans, log_func): self.__log = log_func self.__spouse = None self.__children = [] DataHolder.__init__(self, db, trans, log_func) def get_event_type(self): return 'family' def add_spouse(self, identifier): self.__spouse = identifier # self.__log("Adding spouse with ID `" + identifier + "'") def add_child(self, identifier): self.__children.append(identifier) # self.__log("Adding child with ID `" 
+ identifier + "'") def finish_event_ref(self, event_ref): self.__family.add_event_ref(event_ref) def finish_datum_local(self, name, value, source_refs, sources): return False def create_family(self, person): family = Family() self.store_family(family) spouse = None if self.__spouse: spouse = self.db.get_person_from_gramps_id(self.__spouse) add_family_parent(family, spouse, person) self.commit_person(spouse) add_family_parent(family, person, spouse) for child_id in self.__children: child = self.db.get_person_from_gramps_id(child_id) add_family_child(family, child) self.commit_person(child) return family def spouse_matches(self, family): if self.__spouse == None: return False def parent_matches(parent_type): get_func = getattr(family, 'get_' + parent_type + '_handle') handle = get_func() if handle == None: return False parent = self.db.get_person_from_handle(handle) if parent.get_gramps_id() == self.__spouse: return True return False if parent_matches('father'): return True if parent_matches('mother'): return True def children_match(self, family): child_count = len(self.__children) if child_count < 1: return False matches = 0 child_refs = family.get_child_ref_list() for child_ref in child_refs: handle = child_ref.get_reference_handle() child = self.db.get_person_from_handle(handle) if child.get_gramps_id() in self.__children: ++matches return (float(matches) / float(child_count)) > 0.75 def find_family(self, person): person = self.db.get_person_from_handle(person.get_handle()) handles = person.get_family_handle_list() for handle in handles: family = self.db.get_family_from_handle(handle) if self.spouse_matches(family): return family if self.children_match(family): return family return None def finish_event_local(self, event): if event.get_type() == EventType.MARRIAGE: self.__family.set_relationship(FamilyRelType.MARRIED) def finish(self, person, sources): if self.find_family(person): return self.__family = self.create_family(person) self.finish_data(sources) 
self.commit_family(self.__family) def find_source_component(source_lines, component): for (c, v) in source_lines: if v != None and c == component: return v return None SOURCE_COMPONENT_FUNCTION_MAP = { 'Title': 'title', 'Abbrev': 'abbreviation', 'Publication': 'publication_info', 'Author': 'author', 'Identifier': 'gramps_id' } SOURCE_REF_COMPONENT_FUNCTION_MAP = { 'Page': 'page', 'Quality': 'confidence_level' } class SourceRefGenerator: def __init__(self, source, ref_values): self.__source = source self.__ref_values = ref_values def generate(self): source_ref = SourceRef() for (k, v) in self.__ref_values.iteritems(): if k in SOURCE_REF_COMPONENT_FUNCTION_MAP: func = getattr(source_ref, 'set_' + SOURCE_REF_COMPONENT_FUNCTION_MAP[k]) func(v) else: # only one type: 'Date' source_ref.set_date_object(Date(v)) source_ref.set_reference_handle(self.__source.get_handle()) return source_ref def get_title(self): return self.__source.get_title() def extract_source_title(title): if re.match('^\w\d{4}\s+- ', title): (id, partition, title) = title.partition(' - ') return (title.strip(), id.strip()) return (title, None) # returns (family, given) def extract_name_components(name): (family, comma, given) = name.partition(', ') if len(comma) > 0: return (family, given) # Use capitalisation to distinguish family and given names names = name.split() lower_re = re.compile('[a-z]') roman_re = re.compile('^[IVX]+$') if lower_re.search(names[0]) and not lower_re.search(names[len(names)-1]): given = '' family = '' finished_given = False for n in names: if finished_given: family += ' ' + n else: if lower_re.search(n) or len(n) == 1 or roman_re.match(n): given += ' ' + n else: finished_given = True family += ' ' + n return (family.strip(), given.strip()) names = name.rsplit(None, 1) if len(names) > 1: return (names[1], names[0]) return (name, None) # returns (prefix, family) def extract_name_prefix(family, prefixes): no_prefix = (None, family) family_names = family.split() if 
len(family_names) < 2: return no_prefix # prefixes = [ ['de', 'la'], # ['de'], # ['le'] ] lower_family_names = [x.lower() for x in family_names] for prefix in prefixes: if len(lower_family_names) < len(prefix) + 1: continue match = True for i in range(0, len(prefix) - 1): if lower_family_names[i] != prefix[i].lower(): match = False break if match: return (' '.join(prefix), ' '.join(family_names[len(prefix):])) return no_prefix class PersonData(DataHolder): def __init__(self, db, trans, log_func, person): self.__trans = trans self.__log = log_func self.__person = person self.__sources = [] self.__families = [] self.__current_family = None self.__source_line_re = re.compile('^([^:\s]+): (.*)$') self.__person.get_primary_name().wcpull_component = None self.__primary_name_set = False DataHolder.__init__(self, db, trans, log_func) def get_event_type(self): return 'person' def add_person_datum(self, name, value, sources): self.add_datum(name, value, sources) def add_family_datum(self, name, value, sources): self.__current_family.add_datum(name, value, sources) def new_family(self): family = FamilyData(self.db, self.__trans, self.__log) self.__families.append(family) self.__current_family = family def add_spouse(self, identifier): self.__current_family.add_spouse(identifier) def add_child(self, identifier): self.__current_family.add_child(identifier) def add_source(self, source_lines): # self.__log("Adding source with lines:") # for line in source_lines: # self.__log(" `" + line + "'") split_lines = [] for line in source_lines: match = self.__source_line_re.match(line) if match: split_lines.append( (match.group(1), match.group(2)) ) else: split_lines.append( (line, None) ) # split_lines = [(x, z if len(z) > 0 else None) for (x, y, z) in [x.partition(': ') for x in source_lines] ] self.__sources.append(split_lines) def set_gender(self, gender): if gender in ['M', 'm']: self.__person.set_gender(Person.MALE) elif gender in ['F', 'f']: self.__person.set_gender(Person.FEMALE) 
elif gender in ['U', 'u']: self.__person.set_gender(Person.UNKNOWN) else: self.__log("WARNING: Unknown gender `" + gender + "'") def find_name_with_component(person, component, value): names = [person.get_primary_name()] names.extend(person.get_alternate_names()) for name in names: if component == 'Surname': if name.get_surname() == value: return name elif component == 'Given Name': if name.get_first_name() == value: return name elif component == 'Name': (family, given) = extract_name_components(value) if name.get_surname() == family and name.get_first_name() != given: return name return None def create_name(component, value): family = None given = None if component == 'Surname': family = value elif component == 'Given Name': given = None elif component == 'Name': (family, given) = extract_name_components(value) else: self.__log("WARNING: Unknown name component `" + component + "' (value: `" + value + "')") return None name = Name() name.set_surname(family) name.set_first_name(given) return name def finish_name(self, component, value, source_refs, sources): value = re.sub('\s*#\d+$', '', value) value = re.sub('\s*\(\d+\)$', '', value) value = re.sub('\s{2,}', ' ', value) family = None given = None if component == 'Surname': family = value elif component == 'Given Name': given = value elif component == 'Name': (family, given) = extract_name_components(value) # self.__log("Extracted name `" + value + "' into family `" + family + "', given `" + given + "'") else: self.__log("WARNING: Unknown name component `" + component + "' (value: `" + value + "')") return None unknowns = ['UNKNOWN', 'Unknown', '#', '?', 'Private', 'PRIVATE'] if family in unknowns: family = None if given in unknowns: given = None if not family and not given: return (prefix, family) = extract_name_prefix(family) primary = self.__person.get_primary_name() names = [primary] names.extend(self.__person.get_alternate_names()) name = None if family and not given: for n in names: if n.wcpull_component != 
'Name': surname = n.get_surname() if not surname or surname == family: if not surname: n.set_surname(family) name = n break elif not family and given: for n in names: if n.wcpull_component != 'Name': first_name = n.get_first_name() if not first_name or first_name == given: if not first_name: n.set_first_name(given) name = n break else: for n in names: surname = n.get_surname() first_name = n.get_first_name() if surname and not first_name: if surname == family: name = n name.set_first_name(given) break elif not surname and first_name: if first_name == given: name = n name.set_surname(family) break elif surname and first_name: if surname == family and first_name == given: name = n break if not name: name = Name() name.set_type(NameType.UNKNOWN) name.wcpull_component = component if family: name.set_surname(family) if given: name.set_first_name(given) if primary.is_empty(): self.__person.set_primary_name(name) else: self.__person.add_alternate_name(name) for source_ref in source_refs: if source_ref in sources: name.add_source_reference(sources[source_ref].generate()) else: self.__log("WARNING: No source `" + str(source_ref) + "' for name `" + str(family) + ", " + str(given) + "'") def finish_naming(self): primary = self.__person.get_primary_name() # self.__log("Primary name: `" + primary.get_name() + "'; component: `" + str(primary.wcpull_component) + "'") if primary.wcpull_component != 'Name': return alt_names = self.__person.get_alternate_names() # self.__log("Alternate names: `" + str([x.get_name() for x in alt_names]) + "'") if len(alt_names) < 1: return # Remove any superfluous names new_alt_names = [] for name in alt_names: if name.get_first_name() != primary.get_first_name() or name.get_surname() != primary.get_surname(): new_alt_names.append(name) # else: # self.__log("Removing superfluous name `" + name.get_name() + "'") self.__person.set_alternate_names(new_alt_names) # Switch the general name if a specific one exists alt_names = new_alt_names # 
self.__log("New alternate names: `" + str([x.get_name() + " (component: " + str(x.wcpull_component) + ")" for x in alt_names]) + "'") index = None for name in alt_names: if name.wcpull_component != 'Name' and name.get_surname() and name.get_first_name(): index = alt_names.index(name) # self.__log("Using alternate name `" + name.get_name() + "' with index `" + str(index) + "'") break if index == None: return # self.__log("Changing primary name `" + primary.get_name() + "' for alternate with index " + str(index) + ": `" + alt_names[index].get_name() + "'") (alt_names[index], primary) = (primary, alt_names[index]) self.__person.set_primary_name(primary) self.__person.set_alternate_names(alt_names) def finish_event_ref(self, event_ref): self.__person.add_event_ref(event_ref) def finish_event_local(self, event): None def finish_datum_local(self, name, value, source_refs, sources): if name in ['Sex', 'Gender']: self.set_gender(value) return True if name.lower().find('name') != -1: self.finish_name(name, value, source_refs, sources) return True if name == 'Title': self.__person.get_primary_name.set_title(value) return True return False def find_source(self, title): handles = self.db.get_source_handles() for handle in handles: source = self.db.get_source_from_handle(handle) if source.get_title() == title: return source return None def extract_source(self, source_lines): source_components = SOURCE_COMPONENT_FUNCTION_MAP.keys() source_components.append('Text') ref_components = SOURCE_REF_COMPONENT_FUNCTION_MAP.keys() ref_components.append('Date') source_values = {} source_data = {} ref_values = {} open_text = [] for (component, value) in source_lines: if value != None: if component in source_components: source_values[component] = value elif component in ref_components: ref_values[component] = value else: self.__log("WARNING: Unknown source component `" + component + "' (value: `" + value + "'") source_data[component] = value else: open_text.append(component) # remove 
abbreviation if it's the same as the title if 'Abbreviation' in source_values and source_values['Abbreviation'] == source_values['Title']: del source_values['Abbreviation'] # possibly extract an ID from the title (title, identifier) = extract_source_title(source_values['Title']) if identifier != None: source_values['Title'] = title source_values['Identifier'] = identifier return (source_values, source_data, ref_values, open_text) def create_note(self, typ, text): note = Note() note.set_type(typ) note.set(text) self.store_note(note) return note def create_source(self, source_values, source_data, open_text): source = Source() for (k, v) in source_values.iteritems(): if k in SOURCE_COMPONENT_FUNCTION_MAP: # self.__log("Calling member function `set_" + SOURCE_COMPONENT_FUNCTION_MAP[k] + "' for component `" + k + "' (value: `" + v + "')") func = getattr(source, 'set_' + SOURCE_COMPONENT_FUNCTION_MAP[k]) func(v) else: # only one extra type: 'Text' source.add_note(self.create_note(NoteType.SOURCE_TEXT, v).get_handle()) source.set_data_map(source_data) for text in open_text: source.add_note(self.create_note(NoteType.UNKNOWN, text).get_handle()) self.store_source(source) return source def finish_source(self, source_lines): (source_values, source_data, ref_values, open_text) = self.extract_source(source_lines) if not 'Title' in source_values: self.__log("WARNING: Source with no title; lines:") for (component, value) in source_lines: if value != None: self.__log(" Component `" + component + "': `" + value + "'") else: self.__log(" `" + component + "'") return None source = self.find_source(source_values['Title']) if not source: source = self.create_source(source_values, source_data, open_text) return SourceRefGenerator(source, ref_values) def finish_sources(self): sources = {} index = 1 for source_lines in self.__sources: source_ref_gen = self.finish_source(source_lines) sources[index] = source_ref_gen # title = source_ref_gen.get_title() # if title != None: # 
class PersonParser(DataParser):
    """State machine that scrapes one person page into a PersonData.

    DataParser reports start tags, end tags and text chunks through the
    data_handle_* hooks.  Each hook is routed to the method named
    ``<current state>_starttag``, ``<current state>_endtag`` or
    ``<current state>_data``; events without such a method are ignored.

    States, in the order the handlers below move through them:

    start              waits for the text 'Add Post-em'
    header_done        <div> -> parents, <li> -> person_detail
    person_detail      <em> opens a name/value item
    person_name_start  text is the item name, </em> -> person_name_end
    person_name_end    text is the item value, <sup> -> person_source,
                       <li>/<div> store the item
    person_source      text is a footnote marker, </sup> -> person_name_end
    parents            'Marriage' opens a family, 'Sources:' -> sources
    family             <a> adds a spouse, <ul> -> family_events
    family_events      <li> -> family_detail, </ul> -> children
    family_detail, family_name_start, family_name_end, family_source
                       same item handling as the person_* states
    children           <a> adds a child, </ol> -> family
    sources            text/<br>/<li> build the source entries
    end                nothing more is handled
    """

    def __init__(self, db, trans, log_func, person):
        """Create the parser; all arguments are forwarded to PersonData."""
        self.__log = log_func
        self.__person = PersonData(db, trans, log_func, person)
        self.__state = 'start'
        DataParser.__init__(self)

    def __dispatch(self, suffix, *args):
        """Call the handler for the current state and event kind, if any."""
        handler = getattr(self, self.__state + suffix, None)
        if handler is not None:
            handler(*args)

    #
    # start tags
    #
    def data_handle_starttag(self, tag, attrs):
        self.__dispatch('_starttag', tag, attrs)

    def header_done_starttag(self, tag, attrs):
        if tag == 'div':
            self.__state = 'parents'
        elif tag == 'li':
            self.__state = 'person_detail'

    def person_detail_starttag(self, tag, attrs):
        if tag == 'em':
            self.name_start()
            self.__state = 'person_name_start'

    def person_name_start_starttag(self, tag, attrs):
        if tag == 'li':
            self.__state = 'person_detail'

    def person_name_end_starttag(self, tag, attrs):
        if tag == 'li':
            self.add_datum('person')
            self.__state = 'person_detail'
        elif tag == 'div':
            self.add_datum('person')
            self.__state = 'parents'
        elif tag == 'br':
            self.__value_data += "\n"
        elif tag == 'sup':
            self.value_source_start()
            self.__state = 'person_source'

    def family_starttag(self, tag, attrs):
        if tag == 'a':
            identifier = get_anchor_href_parameter(attrs, 'id')
            if identifier is not None:
                self.__person.add_spouse(identifier)
        elif tag == 'ul':
            self.__state = 'family_events'

    def family_events_starttag(self, tag, attrs):
        if tag == 'li':
            self.__state = 'family_detail'

    def family_detail_starttag(self, tag, attrs):
        if tag == 'em':
            self.name_start()
            self.__state = 'family_name_start'

    def family_name_start_starttag(self, tag, attrs):
        if tag == 'li':
            self.__state = 'family_detail'

    def family_name_end_starttag(self, tag, attrs):
        if tag == 'li':
            self.add_datum('family')
            self.__state = 'family_detail'
        elif tag == 'br':
            self.__value_data += "\n"
        elif tag == 'sup':
            self.value_source_start()
            self.__state = 'family_source'

    def children_starttag(self, tag, attrs):
        if tag == 'a':
            identifier = get_anchor_href_parameter(attrs, 'id')
            if identifier is not None:
                self.__person.add_child(identifier)

    def sources_starttag(self, tag, attrs):
        if tag == 'li':
            self.source_finish()
        elif tag == 'br':
            self.source_finish_line()

    #
    # end tags
    #
    def data_handle_endtag(self, tag):
        self.__dispatch('_endtag', tag)

    def person_name_start_endtag(self, tag):
        if tag == 'em':
            self.__state = 'person_name_end'

    def person_source_endtag(self, tag):
        if tag == 'sup':
            self.add_value_source()
            self.__state = 'person_name_end'

    def family_endtag(self, tag):
        if tag == 'td':
            self.__state = 'end'

    def family_events_endtag(self, tag):
        if tag == 'ul':
            self.__state = 'children'

    def family_name_start_endtag(self, tag):
        if tag == 'em':
            self.__state = 'family_name_end'

    def family_name_end_endtag(self, tag):
        if tag == 'ul':
            self.add_datum('family')
            self.__state = 'children'

    def family_source_endtag(self, tag):
        if tag == 'sup':
            self.add_value_source()
            self.__state = 'family_name_end'

    def children_endtag(self, tag):
        if tag == 'ol':
            self.__state = 'family'
        elif tag == 'td':
            self.__state = 'end'

    def sources_endtag(self, tag):
        if tag == 'td':
            self.source_finish()
            self.__state = 'end'

    #
    # text
    #
    def data_handle_data(self, data):
        self.__dispatch('_data', data)

    def start_data(self, data):
        if data == 'Add Post-em':
            self.__state = 'header_done'

    def person_name_start_data(self, data):
        self.name_start_data(data)

    def person_name_end_data(self, data):
        self.name_end_data(data)

    def person_source_data(self, data):
        self.value_source_data(data)

    def parents_data(self, data):
        if data == 'Marriage':
            self.__person.new_family()
            self.__state = 'family'
        elif data == 'Sources:':
            self.sources_start()
            self.__state = 'sources'

    def family_data(self, data):
        if data == 'Marriage':
            self.__person.new_family()
        elif data == 'Sources:':
            self.sources_start()
            self.__state = 'sources'

    def children_data(self, data):
        if data == 'Sources:':
            self.sources_start()
            self.__state = 'sources'

    def family_name_start_data(self, data):
        self.name_start_data(data)

    def family_name_end_data(self, data):
        self.name_end_data(data)

    def family_source_data(self, data):
        self.value_source_data(data)

    def sources_data(self, data):
        self.__source += data

    #
    # name/value item being collected
    #
    def name_start(self):
        """Begin a new item: empty name, empty value, no footnotes."""
        self.__value_name = ''
        self.__value_data = ''
        self.__value_sources = []

    def name_start_data(self, data):
        self.__value_name += data

    def name_end_data(self, data):
        self.__value_data += data

    def value_source_start(self):
        self.__value_source = ''

    def value_source_data(self, data):
        self.__value_source += data

    def add_value_source(self):
        self.__value_sources.append(self.__value_source)
        self.__value_source = None

    def add_datum(self, typ):
        """Hand the collected item to PersonData and reset the collector.

        typ selects the receiving method, ``add_<typ>_datum``.  It gets
        the item name with leading/trailing colons removed, the value
        with surrounding whitespace removed, and the footnote markers
        converted to integers.
        """
        func = getattr(self.__person, 'add_' + typ + '_datum')
        # An empty <sup></sup> leaves a blank marker behind; int('')
        # would raise ValueError, so blank markers are dropped.
        source_refs = [int(marker) for marker in self.__value_sources
                       if marker.strip()]
        func(self.__value_name.strip(':'),
             self.__value_data.strip(),
             source_refs)
        self.__value_name = None
        self.__value_data = None
        self.__value_sources = None

    #
    # source list
    #
    def sources_start(self):
        self.__source = ''
        self.__source_lines = []

    def source_finish_line(self):
        """Close the current text line of the source entry being read."""
        if len(self.__source) > 0:
            self.__source_lines.append(self.__source.strip())
        self.__source = ''

    def source_finish(self):
        """Close the current source entry and pass its lines to PersonData."""
        self.source_finish_line()
        if len(self.__source_lines) > 0:
            self.__person.add_source(self.__source_lines)
        self.__source_lines = []

    def finish(self):
        """Finalize the collected person; call once the page has been fed."""
        self.__person.finish()
#------------------------------------------------------------------------
#
# WCPull gramplet
#
#------------------------------------------------------------------------
def person_get_wcpull_db_name(person):
    """Return the value of the person's WCPULL_DB_ATTRIBUTE attribute.

    Returns None when the person carries no attribute of that type.
    """
    for attr in person.get_attribute_list():
        if attr.get_type() == WCPULL_DB_ATTRIBUTE:
            return attr.get_value()
    return None

def feed_decoded(stream, sink, charset, read_size=512):
    """Copy `stream` into `sink.feed()` in chunks, re-encoding to UTF-8.

    stream    -- object with read(size) returning byte strings
    sink      -- object with feed(data)
    charset   -- source encoding name, or None/'' to pass bytes through
    read_size -- number of bytes requested per read

    Raises LookupError for an unknown charset and UnicodeDecodeError
    for undecodable or truncated input.
    """
    import codecs

    decoder = None
    if charset:
        # An incremental decoder keeps the bytes of a character that
        # straddles two reads until the rest arrives; decoding every
        # chunk on its own fails on such a split.
        decoder = codecs.getincrementaldecoder(charset)()
    while True:
        chunk = stream.read(read_size)
        if not chunk:
            break
        if decoder is not None:
            chunk = decoder.decode(chunk).encode('utf8')
        if chunk:
            sink.feed(chunk)
    if decoder is not None:
        # `chunk` is the empty read here; final=True makes the decoder
        # complain about a character cut off at the end of the stream.
        tail = decoder.decode(chunk, True)
        if tail:
            sink.feed(tail.encode('utf8'))

def parse_url(URL, parser):
    """Fetch URL and feed the page to `parser` through an HTMLFilter.

    When the response's Content-Type carries a charset parameter the
    page is converted to UTF-8 before it is fed.  The response is
    closed even when fetching or parsing raises.
    """
    fil = HTMLFilter(parser)
    req = urllib2.Request(url = URL)
    req.add_header('User-agent', 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.11) Gecko/2009061208 Iceweasel/3.0.12 (Debian-3.0.12-1)')
    f = urllib2.urlopen(req)
    try:
        # getparam() looks the parameter up in the same list getplist()
        # returns and also removes quotes around the value.
        charset = f.info().getparam('charset')
        feed_decoded(f, fil, charset)
    finally:
        f.close()

class PersonPuller(threading.Thread):
    """Worker thread that pulls every person tagged with a WCPull database.

    db            -- database providing transaction_begin/commit,
                     get_person_handles and get_person_from_handle
    log_func      -- callable taking one message string
    finished_func -- callable without arguments, invoked when run() ends
    """
    _STOP = 0
    _RUN = 1

    def __init__(self, db, log_func, finished_func):
        self.__db = db
        self.__log = log_func
        self.__finished = finished_func
        self.__state_lock = threading.Lock()
        self.__state = self._STOP
        threading.Thread.__init__(self)

    def get_run_state(self):
        """Return the current run state (_RUN or _STOP)."""
        self.__state_lock.acquire()
        try:
            return self.__state
        finally:
            self.__state_lock.release()

    def set_run_state(self, state):
        """Set the run state under the state lock."""
        self.__state_lock.acquire()
        try:
            self.__state = state
        finally:
            self.__state_lock.release()

    def stop(self):
        """Ask run() to stop after the person it is currently handling."""
        self.set_run_state(self._STOP)

    def run(self):
        """Pull all tagged people, in shuffled order, inside one batch.

        The transaction is committed and the finished callback invoked
        even when a pull raises; the exception still propagates.
        """
        self.set_run_state(self._RUN)
        transaction = self.__db.transaction_begin("", batch=True)
        try:
            handles = self.__db.get_person_handles()
            random.shuffle(handles)
            for handle in handles:
                person = self.__db.get_person_from_handle(handle)
                db_name = person_get_wcpull_db_name(person)
                if db_name is not None:
                    self.pull_person(transaction, db_name, person)
                if self.get_run_state() == self._STOP:
                    break
        finally:
            # NOTE(review): a failed run is committed rather than rolled
            # back, matching what a run stopped half-way already does --
            # confirm this is wanted for batch transactions.
            try:
                self.__db.transaction_commit(transaction, "Pull people from WorldConnect databases")
            finally:
                self.__finished()

    def pull_person(self, trans, db_name, person):
        """Fetch and parse the page of `person` from database `db_name`.

        The person's gramps id is sent as the `id` query parameter.
        """
        identifier = person.get_gramps_id()
        url = URL_BASE + '?op=GET&db=' + db_name + '&id=' + identifier
        parser = PersonParser(self.__db, trans, self.__log, person)
        parse_url(url, parser)
        parser.finish()
class WCPull(Gramplet):
    """Gramplet with a log view, a database-name entry and two actions.

    "Pull index" runs pull_db_index() directly from the click handler.
    "Pull people" starts and stops a PersonPuller thread.  The thread
    only appends to the log list and sets a finished flag; a 100 ms
    timeout (thread_checker) copies both into the widgets.
    """

    def init(self):
        """Create state and widgets, and start the polling timeout."""
        self.__log_lock = threading.Lock()
        self.__log = []
        self.__pull_lock = threading.Lock()
        self.__puller = None
        self.__pull_finished = False

        main_box = gtk.VBox()

        log_window = gtk.ScrolledWindow()
        main_box.pack_start(log_window, True)
        log_window.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.__log_view = gtk.TextView()
        log_window.add(self.__log_view)
        self.__log_view.set_editable(False)
        self.__log_text = self.__log_view.get_buffer()

        entry_box = gtk.HBox()
        main_box.pack_start(entry_box, False)
        self.__db_name = gtk.Entry()
        entry_box.pack_start(self.__db_name, True)
        self.__db_name.set_text('20070710_csl')
        button = gtk.Button(_("Pull index"))
        button.connect("clicked", self.pull_clicked)
        entry_box.pack_start(button, False)

        person_box = gtk.HBox()
        main_box.pack_start(person_box, False)
        self.__pull_button = gtk.ToggleButton(_("Pull people"))
        self.__pull_button.connect("toggled", self.pull_people_toggled)
        person_box.pack_start(self.__pull_button, True)

        # swap the gramplet's own text view for our widgets
        self.gui.get_container_widget().remove(self.gui.textview)
        self.gui.get_container_widget().add_with_viewport(main_box)
        main_box.show_all()

        gobject.timeout_add(100, self.thread_checker)

    def pull_clicked(self, obj):
        """"Pull index" handler: pull the index of the entered database."""
        name = self.__db_name.get_text()
        # NOTE(review): one-character names are ignored as well as empty
        # ones -- confirm whether `> 0` was meant.
        if name is None or len(name) <= 1:
            return
        try:
            self.pull_db_index(name)
        except Errors.DbError:
            # sys.exc_info() avoids the `except X, name` form, which is
            # not valid syntax on newer interpreters.
            msg = sys.exc_info()[1]
            DBErrorDialog(str(msg.value))
        except Exception:
            print_exc()
            ErrorDialog("Error pulling WorldConnect database")

    def pull_people_toggled(self, obj):
        """"Pull people" handler: start or stop the worker thread."""
        if obj.get_active():
            self.start_pulling_people()
        else:
            self.stop_pulling_people()

    def start_pulling_people(self):
        """Start a PersonPuller unless one is already registered."""
        self.__pull_lock.acquire()
        try:
            if self.__puller is not None:
                return
            puller = PersonPuller(self.dbstate.db, self.log,
                                  self.pull_finished)
            self.__puller = puller
        finally:
            self.__pull_lock.release()
        puller.start()

    def stop_pulling_people(self):
        """Unregister the current puller, ask it to stop and wait for it."""
        self.__pull_lock.acquire()
        try:
            puller = self.__puller
            self.__puller = None
        finally:
            self.__pull_lock.release()
        if puller is None:
            return
        # The lock is released before join(): the worker takes it in
        # pull_finished() on its way out.
        puller.stop()
        puller.join()

    def pull_finished(self):
        """Callback given to PersonPuller: flag that the run has ended.

        Nothing is flagged when the puller was already unregistered by
        stop_pulling_people().
        """
        self.__pull_lock.acquire()
        try:
            if self.__puller is not None:
                self.__pull_finished = True
        finally:
            self.__pull_lock.release()

    def pull_check(self):
        """Release the toggle button once the worker has flagged its end."""
        self.__pull_lock.acquire()
        try:
            if not self.__pull_finished:
                return
            self.__puller = None
            self.__pull_finished = False
        finally:
            self.__pull_lock.release()
        # Outside the lock: this emits "toggled", whose handler takes
        # the same lock in stop_pulling_people().
        self.__pull_button.set_active(False)

    def thread_checker(self):
        """Timeout callback; returns True so that it keeps being called."""
        self.pull_check()
        self.log_read()
        return True

    def log_read(self):
        """Append all queued log messages to the text view."""
        # Take the queue while holding the lock, write to the widgets
        # after releasing it, so log() never waits on GTK and an error
        # in a GTK call cannot leave the lock held.
        self.__log_lock.acquire()
        try:
            pending = self.__log
            self.__log = []
        finally:
            self.__log_lock.release()
        for string in pending:
            end = self.__log_text.get_end_iter()
            self.__log_text.insert(end, string + "\n")
            self.__log_view.scroll_to_iter(end, 0.0)
        while gtk.events_pending():
            gtk.main_iteration()

    def log(self, string):
        """Queue a message; this is the log function given to the worker."""
        self.__log_lock.acquire()
        try:
            self.__log.append(string)
        finally:
            self.__log_lock.release()

    def log_local(self, string):
        """Queue a message and show it at once."""
        self.log(string)
        self.log_read()

    def pull_db_index(self, db_name):
        """Pull all index pages of `db_name` inside one batch transaction.

        Pages are requested starting at record offset 0 until
        pull_index() returns None.  The transaction is committed even
        when a page fails; the exception still propagates.
        """
        self.log_local("Pulling index for database `" + db_name + "'")
        transaction = self.dbstate.db.transaction_begin("", batch=True)
        try:
            index_offset = self.pull_index(transaction, db_name, 0)
            while index_offset is not None:
                index_offset = self.pull_index(transaction, db_name,
                                               index_offset)
            self.log_local("Pulled index for database `" + db_name + "'")
        finally:
            self.dbstate.db.transaction_commit(transaction, "Pull index for WorldConnect database `" + db_name + "'")

    def pull_index(self, trans, db_name, index_offset):
        """Pull the index page at `index_offset`.

        Returns whatever the IndexParser reports as the next offset.
        """
        self.log_local('Pulling index at record offset ' + str(index_offset))
        url = URL_BASE + '?op=SHOW&db=' + db_name + '&recno=' + str(index_offset)
        parser = IndexParser(self.dbstate.db, trans, self.log_local, db_name)
        parse_url(url, parser)
        return parser.get_next_offset()
# Maps a type name found on a page to the name used for the event type.
# NOTE(review): the consumer of this table is outside this part of the
# file -- confirm how it is looked up.
type_name_map = {'Married': 'Marriage'}

# import gedcom types into the type name map
import GrampsDbUtils._GedcomInfo

def gedcom_map_append(map_name, name_map):
    """Merge an inverted GEDCOM event table into `name_map`.

    The table is the `<map_name>ConstantEvents` attribute of
    GrampsDbUtils._GedcomInfo.  Every value of that table becomes a key
    of `name_map`, mapped to EventType(<table key>).xml_str().
    """
    table = getattr(GrampsDbUtils._GedcomInfo, map_name + 'ConstantEvents')
    inverted = [(gedcom_name, EventType(event_type).xml_str())
                for (event_type, gedcom_name) in table.items()]
    name_map.update(inverted)

gedcom_map_append('family', type_name_map)
gedcom_map_append('personal', type_name_map)

# an empty name cannot be looked up meaningfully; drop it if present
type_name_map.pop("", None)

register(
    type="gramplet",
    name="WorldConnect Gramplet",
    tname=_("WorldConnect Gramplet"),
    height=800,
    content=WCPull,
    title="WorldConnect",
)