from omgeo.processor import _Processor
import math
from operator import attrgetter
class _PostProcessor(_Processor):
    """Takes, processes, and returns list of geocoding.places.Candidate objects."""

    def process(self, candidates):
        """
        Transform a list of candidates; must be overridden by subclasses.

        :arg list candidates: list of Candidate instances
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError(
            'PostProcessor subclasses must implement process().')

    def is_case_sensitive(self):
        """
        Describe the ``case_sensitive`` setting for use in ``__repr__``.

        :returns: ``'CS'`` if ``self.case_sensitive`` is truthy, ``'CI'`` if
                  it is falsy, or ``'NA'`` if the attribute is not set.
        """
        try:
            return 'CS' if self.case_sensitive else 'CI'
        # A missing attribute raises AttributeError, not ValueError;
        # ValueError is still caught for backward compatibility.
        except (AttributeError, ValueError):
            return 'NA'

    def is_exact(self):
        """
        Describe the ``exact_match`` setting for use in ``__repr__``.

        :returns: ``'EXACT_MATCH'`` if ``self.exact_match`` is truthy,
                  ``'INEXACT_MATCH'`` if it is falsy, or ``'NA'`` if the
                  attribute is not set.
        """
        try:
            return 'EXACT_MATCH' if self.exact_match else 'INEXACT_MATCH'
        # See is_case_sensitive() for why AttributeError is handled here.
        except (AttributeError, ValueError):
            return 'NA'
class LocatorFilter(_PostProcessor):
    """
    PostProcessor that throws away candidates produced by unwanted locators.

    :arg list good_locators: locator values whose candidates are kept
    """

    def __init__(self, good_locators):
        """
        :arg list good_locators: locator values whose candidates are kept
        """
        self.good_locators = good_locators

    def process(self, candidates):
        """
        Remove, in place, every candidate whose ``locator`` is not one of
        ``good_locators``.

        :arg list candidates: list of Candidate instances
        :returns: the same list object, with the rejected candidates removed
        """
        # Decide what to drop from a snapshot first, then edit the real list.
        rejected = [cand for cand in candidates[:]
                    if cand.locator not in self.good_locators]
        # TODO: search string, i.e. find "EU_Street_Name" in "EU_Street_Name.GBR_StreetName"
        for cand in rejected:
            candidates.remove(cand)
        return candidates

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s: %s>' % (class_name, self.good_locators)
class LocatorSorter(_PostProcessor):
    """
    PostProcessor used to sort candidates by locator.
    """

    def __init__(self, ordered_locators):
        """
        :arg list ordered_locators: a list of :py:attr:`Candidate.locator` values
                                    placed in the desired order, such as ``rooftop``,
                                    ``interpolation``, or ``postal``.
        """
        self.ordered_locators = ordered_locators

    def process(self, unordered_candidates):
        """
        Return the candidates grouped in the order given by
        ``ordered_locators``. Candidates whose locator is not listed are
        appended at the end in their original relative order.

        The input list is not modified.

        :arg list unordered_candidates: list of Candidate instances
        :returns: new, reordered list of Candidate instances
        """
        # Work on a copy so the caller's list is left untouched.
        remaining = list(unordered_candidates)
        ordered_candidates = []
        for locator in self.ordered_locators:
            leftover = []
            for candidate in remaining:
                if candidate.locator == locator:
                    ordered_candidates.append(candidate)
                else:
                    leftover.append(candidate)
            remaining = leftover
        # Whatever is left has a locator that is not in ordered_locators.
        return ordered_candidates + remaining

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.ordered_locators)
class AttrRename(_PostProcessor):
    """
    PostProcessor that rewrites the value of one attribute on each candidate
    using a lookup table of old names to new names. Candidates whose value
    matches no key in the table are passed through unchanged.
    """

    def __init__(self, attr, attr_map=None, exact_match=False, case_sensitive=False):
        """
        :arg str attr: Name of the attribute
        :arg dict attr_map: Map of old names : new names.
        :arg bool exact_match: if ``False``, a key matches when it is a
                               substring of the attribute value; otherwise
                               the key must equal the attribute value.
        :arg bool case_sensitive: if falsy, comparisons are made on
                                  lower-cased strings.
        """
        self.attr = attr
        if attr_map is None:
            attr_map = {}
        self.attr_map = attr_map
        self.exact_match = exact_match
        self.case_sensitive = case_sensitive

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: list of Candidate instances with modified values for the given attribute
        """
        not_found = object()  # sentinel: distinguishes "no key" from any real key

        def normalise(text):
            if self.case_sensitive:
                return text
            return text.lower()

        renamed = []
        for candidate in candidates[:]:
            current = normalise(getattr(candidate, self.attr))
            hit = not_found
            if self.exact_match is False:
                # substring match: first key contained in the current value
                hit = next((key for key in self.attr_map
                            if normalise(key) in current), not_found)
            if hit is not_found:
                # whole-value match: first key equal to the current value
                hit = next((key for key in self.attr_map
                            if normalise(key) == current), not_found)
            if hit is not not_found:
                setattr(candidate, self.attr, self.attr_map[hit])
            renamed.append(candidate)
        return renamed

    def __repr__(self):
        details = (self.__class__.__name__, self.is_exact(),
                   self.is_case_sensitive(), self.attr, self.attr_map)
        return '<%s: %s %s Map of %s (old:new): %s>' % details
class UseHighScoreIfAtLeast(_PostProcessor):
    """
    Keep only the results scoring at least ``min_score``, provided that
    at least one result reaches that score. When no result reaches it,
    the original results are handed back untouched.
    """

    def __init__(self, min_score):
        """
        :arg min_score: threshold a candidate's score must reach to be kept
        """
        self.min_score = min_score

    def process(self, candidates):
        """
        :arg list candidates: list of Candidates
        :returns: the Candidates whose score is at least min_score when
                  there are any; otherwise the original list of Candidates.
        """
        threshold = self.min_score
        passing = []
        for candidate in candidates:
            if candidate.score >= threshold:
                passing.append(candidate)
        return passing if passing else candidates

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s: %s>' % (class_name, self.min_score)
class ScoreSorter(_PostProcessor):
    """PostProcessor that orders :py:class:`Candidate` objects by score."""

    def __init__(self, reverse=True):
        """
        :arg bool reverse: when true, the highest scores come first
                           (e.g. 100, 90, 80, ...) (default ``True``)
        """
        self.reverse = reverse

    def process(self, candidates):
        """
        :arg list candidates: list of Candidates
        :returns: new list of Candidates sorted by score
        """
        ranked = list(candidates)
        ranked.sort(key=attrgetter('score'), reverse=self.reverse)
        return ranked

    def __repr__(self):
        if self.reverse:
            order = 'high to low'
        else:
            order = 'low to high'
        return '<%s: %s>' % (self.__class__.__name__, order)
class AttrSorter(_PostProcessor):
    """
    PostProcessor used to sort by the given attribute, with candidates whose
    attribute value is not listed appearing at the end of the list.

    :arg list ordered_values: A list of values placed in the desired order.
    :arg str attr: The attribute on which to sort.
    """

    def __init__(self, ordered_values=None, attr='locator'):
        self.ordered_values = [] if ordered_values is None else ordered_values
        self.attr = attr

    def process(self, unordered_candidates):
        """
        Return the candidates grouped in the order given by
        ``ordered_values``. Candidates whose attribute value is not listed
        are appended at the end in their original relative order.

        The input list is not modified.

        :arg list unordered_candidates: list of Candidate instances
        :returns: new, reordered list of Candidate instances
        """
        # Work on a copy so the caller's list is left untouched.
        remaining = list(unordered_candidates)
        ordered_candidates = []
        for value in self.ordered_values:
            leftover = []
            for candidate in remaining:
                if getattr(candidate, self.attr) == value:
                    ordered_candidates.append(candidate)
                else:
                    leftover.append(candidate)
            remaining = leftover
        # Whatever is left has a value that is not in ordered_values.
        return ordered_candidates + remaining

    def __repr__(self):
        return '<%s: %s sorted by %s>' % \
            (self.__class__.__name__, self.attr, self.ordered_values)
class AttrReverseSorter(_PostProcessor):
    """
    PostProcessor used to sort by the given attributes in reverse order,
    with unspecified attributes appearing at the end of the list.

    This is good to use when a list has already been defined in a script
    and you are too lazy to use the reverse() function, or don't want
    to in order to maintain more readable code.
    """

    def __init__(self, ordered_values=None, attr='locator'):
        """
        :arg list ordered_values: A list of values placed in the reverse of the desired order.
        :arg str attr: The attribute on which to sort
        """
        self.ordered_values = [] if ordered_values is None else ordered_values
        self.attr = attr

    def process(self, unordered_candidates):
        """
        Sort the candidates by ``attr`` using ``ordered_values`` read from
        last to first; the actual sorting is delegated to AttrSorter.

        :arg list unordered_candidates: list of Candidate instances
        :returns: reordered list of Candidate instances
        """
        # Build a reversed copy: reversing self.ordered_values in place would
        # flip the order back on every second call and alter the caller's list.
        reversed_values = list(reversed(self.ordered_values))
        sorter = AttrSorter(reversed_values, self.attr)
        return sorter.process(unordered_candidates)

    def __repr__(self):
        return '<%s: %s reverse sorted by %s>' % \
            (self.__class__.__name__, self.attr, self.ordered_values)
class AttrMigrator(_PostProcessor):
    """
    PostProcessor that looks up the value of one attribute in a mapping and
    writes the mapped value to another attribute. Candidates whose source
    value matches no key in the mapping are left unchanged.
    """

    def __init__(self, attr_from, attr_to, attr_map=None, exact_match=False, case_sensitive=False):
        """
        :arg str attr_from: name of the attribute whose value is looked up
        :arg str attr_to: name of the attribute that receives the mapped value
        :arg dict attr_map: map of source values : values to write
        :arg bool exact_match: if ``False``, a key matches when it is a
                               substring of the source value; otherwise the
                               key must equal the source value.
        :arg bool case_sensitive: if ``False``, comparisons are made on
                                  lower-cased strings.
        """
        self.attr_from = attr_from
        self.attr_to = attr_to
        if attr_map is None:
            attr_map = {}
        self.attr_map = attr_map
        self.exact_match = exact_match
        self.case_sensitive = case_sensitive

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: list of the same Candidate instances, with ``attr_to`` set
                  on those whose ``attr_from`` value matched a key
        """
        not_found = object()  # sentinel: distinguishes "no key" from any real key

        def normalise(text):
            return text.lower() if self.case_sensitive is False else text

        migrated = []
        for candidate in candidates[:]:
            source = normalise(getattr(candidate, self.attr_from))
            hit = not_found
            if self.exact_match is False:
                # substring match: first key contained in the source value
                hit = next((key for key in self.attr_map
                            if normalise(key) in source), not_found)
            if hit is not_found:
                # whole-value match: first key equal to the source value
                hit = next((key for key in self.attr_map
                            if normalise(key) == source), not_found)
            if hit is not not_found:
                setattr(candidate, self.attr_to, self.attr_map[hit])
            migrated.append(candidate)
        return migrated

    def __repr__(self):
        details = (self.__class__.__name__, self.attr_from, self.attr_to,
                   self.is_exact(), self.is_case_sensitive())
        return '<%s: %s -> %s %s %s>' % details
class AttrFilter(_PostProcessor):
    """
    PostProcessor used to filter out results without desired attribute values.
    """

    def __init__(self, good_values=None, attr='locator', exact_match=True):
        """
        :arg list good_values: A list of values whose candidates we will
                               accept results from (default: empty list)
        :arg string attr: The attribute type on which to filter
        :arg bool exact_match: True if attribute must match a good value exactly.
                               False if a good value may be a substring of
                               the attribute. In other words, if our Candidate
                               attribute is 'US_Rooftop' and one of the good_values
                               is 'Rooftop', we will keep this candidate.
        """
        # Use a fresh list per instance, not one shared mutable default.
        # This must happen before vars() is captured and must not introduce
        # any extra local names, since all locals are handed to _init_helper.
        if good_values is None:
            good_values = []
        self._init_helper(vars())

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: new list holding only the candidates whose ``attr`` value
                  is accepted by ``good_values``
        """
        if self.exact_match is True:
            return [c for c in candidates if getattr(c, self.attr) in self.good_values]
        # Inexact: keep a candidate when any good value occurs inside its attribute.
        return [c for c in candidates
                if any(gv in getattr(c, self.attr) for gv in self.good_values)]

    def __repr__(self):
        return '<%s: %s %s in %s>' % \
            (self.__class__.__name__, self.is_exact(), self.attr, self.good_values)
class AttrExclude(_PostProcessor):
    """
    PostProcessor used to filter out results with unwanted attribute values.
    """

    def __init__(self, bad_values=None, attr='locator', exact_match=True):
        """
        :arg list bad_values: A list of values whose candidates we will
                              not accept results from (default: empty list)
        :arg string attr: The attribute type on which to filter
        :arg bool exact_match: True if attribute must match a bad value exactly.
                               False if the bad value can be a substring of the
                               attribute value. In other words, if our Candidate
                               attribute is 'Postcode3' and one of the bad values
                               is 'Postcode' because we want something more precise,
                               like 'Address', we will not keep this candidate.
        """
        # Use a fresh list per instance, not one shared mutable default.
        # This must happen before vars() is captured and must not introduce
        # any extra local names, since all locals are handed to _init_helper.
        if bad_values is None:
            bad_values = []
        self._init_helper(vars())

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: new list holding only the candidates whose ``attr`` value
                  is not rejected by ``bad_values``
        """
        if self.exact_match is True:
            return [c for c in candidates if getattr(c, self.attr) not in self.bad_values]
        # Inexact: drop a candidate when any bad value occurs inside its attribute.
        return [c for c in candidates
                if not any(bv in getattr(c, self.attr) for bv in self.bad_values)]

    def __repr__(self):
        return '<%s: %s %s in %s>' % \
            (self.__class__.__name__, self.is_exact(), self.attr, self.bad_values)
class AttrListIncludes(_PostProcessor):
    """
    PostProcessor used to filter out results without desired attribute list items.

    Similar to `AttrFilter` but operates on attributes containing lists instead of scalar values.
    """

    def __init__(self, good_values=None, attr='entity_types'):
        """
        :arg list good_values: A list of values, one of which must be in the
                               attribute being filtered on (default: empty list)
        :arg string attr: The attribute on which to filter
        """
        # Use a fresh list per instance, not one shared mutable default.
        # This must happen before vars() is captured and must not introduce
        # any extra local names, since all locals are handed to _init_helper.
        if good_values is None:
            good_values = []
        self._init_helper(vars())

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: new list holding only the candidates whose ``attr``
                  contains at least one of ``good_values``
        """
        return [c for c in candidates
                if any(gv in getattr(c, self.attr) for gv in self.good_values)]

    def __repr__(self):
        # Three placeholders for three values; the previous four-placeholder
        # format string raised TypeError whenever repr() was called.
        return '<%s: %s in %s>' % \
            (self.__class__.__name__, self.attr, self.good_values)
class AttrListExcludes(_PostProcessor):
    """
    PostProcessor used to ditch results with unwanted attribute list items.

    Similar to `AttrExclude` but operates on attributes containing lists instead of scalar values.
    """

    def __init__(self, bad_values=None, attr='entity_types'):
        """
        :arg list bad_values: A list of values, which cannot be in the
                              attribute being filtered on (default: empty list)
        :arg string attr: The attribute on which to filter
        """
        # Use a fresh list per instance, not one shared mutable default.
        # This must happen before vars() is captured and must not introduce
        # any extra local names, since all locals are handed to _init_helper.
        if bad_values is None:
            bad_values = []
        self._init_helper(vars())

    def process(self, candidates):
        """
        :arg list candidates: list of Candidate instances
        :returns: new list holding only the candidates whose ``attr``
                  contains none of ``bad_values``
        """
        return [c for c in candidates
                if not any(bv in getattr(c, self.attr) for bv in self.bad_values)]

    def __repr__(self):
        # Three placeholders for three values; the previous four-placeholder
        # format string raised TypeError whenever repr() was called.
        return '<%s: %s in %s>' % \
            (self.__class__.__name__, self.attr, self.bad_values)
class DupePicker(_PostProcessor):
    """
    PostProcessor used to choose the best candidate(s)
    where there are duplicates (or more than one result
    that is very similar*) among high-scoring candidates,
    such as an address.

    * When comparing attribute values, case and commas do not count.

    Usage Example:

    ================ ===== =======
    match_addr       score locator
    ================ ===== =======
    123 N Wood St    90    roof
    123 S Wood St    90    address
    123 N WOOD ST    77    address
    123, S Wood ST   85    roof
    ================ ===== =======

    Above, the first two results have the highest scores. We could just
    use those, because one of the two likely has the correct address.
    However, the second result does not have the most precise location
    for 123 S. Wood Street. While the fourth result does not score as
    high as the first two, its locator method is more desirable.
    Since the addresses are the same, we can assume that the fourth result
    will provide better data than the second.

    We can get a narrowed list as described above by running the process()
    method in the DupePicker() PostProcessor class as follows, assuming
    that the "candidates" is our list of candidates::

        dp = DupePicker(
            attr_dupes='match_addr',
            attr_sort='locator',
            ordered_list=['roof', 'address'])
        return dp.process(candidates)

    Output:

    ================ ===== =======
    match_addr       score locator
    ================ ===== =======
    123 N Wood St    90    roof
    123, S Wood ST   85    roof
    ================ ===== =======

    Output with return_clean=True:

    ================ ===== =======
    match_addr       score locator
    ================ ===== =======
    123 N WOOD ST    90    roof
    123 S WOOD ST    85    roof
    ================ ===== =======
    """

    def __init__(self, attr_dupes, attr_sort, ordered_list, return_clean=False):
        """
        :arg str attr_dupes: Property on which to look for duplicates.
        :arg str attr_sort: Property on which to sort using ordered_list
        :arg list ordered_list: A list of property values, from most desirable
                                to least desirable.
        :arg bool return_clean: Boolean indicating whether or not to
                                homogenize the ``attr_dupes`` string value of
                                the returned candidates into uppercase
                                without commas.
        """
        self._init_helper(vars())

    def process(self, candidates):
        """
        Pick the best candidate(s) among near-duplicates of the top scorers.

        For each candidate with the highest score, all candidates whose
        ``attr_dupes`` value is the same (ignoring case and commas) are
        sorted by ``attr_sort`` using ``ordered_list``; the candidates
        sharing the top one's exact ``attr_dupes`` value are kept.

        :arg list candidates: list of Candidate instances
        :returns: list of the chosen Candidate instances. When
                  ``return_clean`` is set, their ``attr_dupes`` attribute is
                  overwritten in place with the cleaned-up value.
        """
        def cleanup(value):
            """Returns string in uppercase and free of commas."""
            if isinstance(value, str):
                return value.replace(',', '').upper()
            return value

        # if there are no candidates, then there is nothing to do here
        if not candidates:
            return []

        attr_match = self.attr_dupes
        hi_score = max(c.score for c in candidates)
        hi_score_candidates = [c for c in candidates if c.score == hi_score]
        new_candidates = []
        for hsc in hi_score_candidates:
            # get candidates with same address, including the current one:
            attr_match_test_val = cleanup(getattr(hsc, attr_match))
            # make a list of candidates that have essentially the same value
            # for attr_match (like 123 Main & 123 MAIN)
            matching_candidates = [mc for mc in candidates
                                   if cleanup(getattr(mc, attr_match)) == attr_match_test_val]
            # sort them in the desired order so the first one has the best attribute value
            matching_candidates = AttrSorter(self.ordered_list, self.attr_sort).process(matching_candidates)
            # the value to keep is taken from the top result:
            best_attr_value = getattr(matching_candidates[0], attr_match)
            # keep the results that share it, if they are not there already:
            for nc in matching_candidates:
                if getattr(nc, attr_match) == best_attr_value and nc not in new_candidates:
                    new_candidates.append(nc)

        if self.return_clean:
            # Clean only after all matching is done, so the exact comparisons
            # above always see the original attribute values.
            for nc in new_candidates:
                setattr(nc, attr_match, cleanup(getattr(nc, attr_match)))
        return new_candidates

    def __repr__(self):
        repr_ = '%s: SORT BY %s %s -> GROUP BY %s' % \
            (self.__class__.__name__, self.attr_sort, self.ordered_list, self.attr_dupes)
        if self.return_clean:
            repr_ += ' -> CLEAN'
        return '<%s>' % repr_
class GroupBy(_PostProcessor):
    """
    Groups results by a certain attribute by choosing the
    first occurrence in the list (this means you will want
    to sort ahead of time).

    attr -- The attribute on which to combine results
            or a list or tuple of attributes where all
            attributes must match between candidates.
    """

    def __init__(self, attr='match_addr'):
        self._init_helper(vars())

    def process(self, candidates):
        """
        Keep the first candidate seen for each distinct value of ``attr``.

        The input list is not modified. When ``attr`` is a list or tuple,
        the work is delegated to GroupByMultiple.

        :arg list candidates: list of Candidate instances
        :returns: new list with one Candidate per distinct attribute value,
                  in order of first appearance
        """
        if isinstance(self.attr, (tuple, list)):
            return GroupByMultiple(attrs=self.attr).process(candidates)
        keepers = []
        # A list, not a set, so unhashable attribute values still work.
        seen_values = []
        for candidate in candidates:
            value = getattr(candidate, self.attr)
            if value not in seen_values:
                seen_values.append(value)
                keepers.append(candidate)
        return keepers

    def __repr__(self):
        return '<%s: %s>' % \
            (self.__class__.__name__, self.attr)
class GroupByMultiple(_PostProcessor):
    """
    Groups results by several attributes by choosing the
    first occurrence in the list of candidates
    (this means you will want to sort ahead of time).

    attrs -- A list or tuple of attributes on which to combine results
    """

    def __init__(self, attrs):
        self._init_helper(vars())

    def process(self, candidates):
        """
        Keep the first candidate seen for each distinct combination of the
        values of ``attrs``.

        The input list is not modified.

        :arg list candidates: list of Candidate instances
        :returns: new list with one Candidate per distinct combination, in
                  order of first appearance
        """
        keepers = []
        # A list, not a set, so unhashable attribute values still work.
        seen_keys = []
        for candidate in candidates:
            key = tuple(getattr(candidate, attr) for attr in self.attrs)
            if key not in seen_keys:
                seen_keys.append(key)
                keepers.append(candidate)
        return keepers

    def __repr__(self):
        return '<%s: %s>' % \
            (self.__class__.__name__, self.attrs)
class SnapPoints(_PostProcessor):
    """
    Chooses the first of two or more points where they are within the given
    sphere-based great circle distance.
    """

    def __init__(self, distance=50):
        """
        :arg distance: maximum distance (in metres) between two points in which
                       the first will be kept and the second thrown out
                       (default 50).
        """
        self.distance = distance

    def _get_distance(self, pnt1, pnt2):
        """
        Get distance in metres between two lat/long points.

        :arg pnt1: ``(lat, lon)`` pair in degrees
        :arg pnt2: ``(lat, lon)`` pair in degrees
        :returns: haversine great-circle distance in metres
        """
        lat1, lon1 = pnt1
        lat2, lon2 = pnt2
        radius = 6356752  # metres (about 6356.752 km)
        dlat = math.radians(lat2 - lat1)
        dlon = math.radians(lon2 - lon1)
        a = math.sin(dlat / 2) ** 2 + math.cos(math.radians(lat1)) \
            * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
        # points, which would make sqrt(1 - a) raise a math domain error.
        a = min(1.0, max(0.0, a))
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        return radius * c

    def _points_within_distance(self, pnt1, pnt2):
        """Returns true if lat/lon points are within given distance in metres."""
        return self._get_distance(pnt1, pnt2) <= self.distance

    def process(self, candidates):
        """
        Collapse candidates that lie within ``distance`` metres of each other,
        keeping the first of each cluster.

        Every candidate, in input order, is used as a reference point: the
        not-yet-consumed candidates within ``distance`` of it are consumed and
        the first of them is kept. A candidate that was already consumed still
        serves as a reference point when its turn comes.

        The input list is not modified.

        :arg list candidates: list of Candidate instances with ``x`` and ``y``
        :returns: new list of the kept Candidate instances
        """
        # NOTE(review): points are built as (x, y) but _get_distance unpacks
        # them as (lat, lon). If Candidate.x holds longitude, this order is
        # swapped -- confirm against Candidate before changing.
        remaining = list(candidates)
        keepers = []
        for reference in list(candidates):
            if not remaining:
                break
            ref_point = (reference.x, reference.y)
            near = []
            far = []
            for candidate in remaining:
                if self._points_within_distance(ref_point, (candidate.x, candidate.y)):
                    near.append(candidate)
                else:
                    far.append(candidate)
            if near:
                keepers.append(near[0])
            remaining = far
        return keepers

    def __repr__(self):
        return '<%s: distance=%sm>' % \
            (self.__class__.__name__, self.distance)