1
1
mirror of https://github.com/OpenVoiceOS/OpenVoiceOS synced 2025-04-15 10:58:02 +02:00
j1nx c6460b9307 [WIP] Pushed for backup.
... Do not build this as of yet ...
2023-06-01 15:16:04 +02:00

492 lines
21 KiB
Python
Executable File

import json
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from struct import unpack
from typing import List, Optional, Union
import numpy as np
from h3.api import numpy_int as h3
from timezonefinder import utils
from timezonefinder.configs import (
BINARY_DATA_ATTRIBUTES,
BINARY_FILE_ENDING,
DATA_ATTRIBUTE_NAMES,
DTYPE_FORMAT_H,
DTYPE_FORMAT_H_NUMPY,
DTYPE_FORMAT_I,
DTYPE_FORMAT_SIGNED_I_NUMPY,
HOLE_ADR2DATA,
HOLE_COORD_AMOUNT,
HOLE_DATA,
HOLE_REGISTRY,
HOLE_REGISTRY_FILE,
NR_BYTES_H,
NR_BYTES_I,
POLY_ADR2DATA,
POLY_COORD_AMOUNT,
POLY_DATA,
POLY_MAX_VALUES,
POLY_NR2ZONE_ID,
POLY_ZONE_IDS,
SHORTCUT_FILE,
SHORTCUT_H3_RES,
TIMEZONE_NAMES_FILE,
CoordLists,
CoordPairs,
)
from timezonefinder.hex_helpers import read_shortcuts_binary
from timezonefinder.utils import inside_polygon
class AbstractTimezoneFinder(ABC):
    """Base class bundling the data loading and lookups shared by all timezone finder classes.

    On construction the timezone names (JSON), the shortcut mapping and every binary file named in
    ``binary_data_attributes`` are loaded from ``bin_file_location``.

    :ivar binary_data_attributes: the names of all attributes which store the opened binary data files
    :param bin_file_location: path to the binary data files to use,
        ``None`` if the directory containing this module should be used
    :param in_memory: whether to completely read and keep the binary files in memory
    """

    # TODO document attributes in all classes
    # prevent dynamic attribute assignment (-> safe memory)
    __slots__ = [
        "bin_file_location",
        "shortcut_mapping",
        "in_memory",
        "_fromfile",
        "timezone_names",
        POLY_ZONE_IDS,
    ]

    binary_data_attributes: List[str] = [POLY_ZONE_IDS]

    def __init__(
        self,
        bin_file_location: Optional[Union[str, Path]] = None,
        in_memory: bool = False,
    ):
        self.in_memory = in_memory
        # the array reader has to match where the binary data lives (in-memory buffer vs. file on disk)
        self._fromfile = utils.fromfile_memory if self.in_memory else np.fromfile

        # for more info on what is stored in which .bin file, please read the comments in file_converter.py
        if bin_file_location is None:
            bin_file_location = Path(__file__).parent
        else:
            bin_file_location = Path(bin_file_location)
        self.bin_file_location: Path = bin_file_location

        with open(self.bin_file_location / TIMEZONE_NAMES_FILE) as json_file:
            self.timezone_names = json.load(json_file)

        path2shortcut_bin = self.bin_file_location / SHORTCUT_FILE
        self.shortcut_mapping = read_shortcuts_binary(path2shortcut_bin)

        # open all the binary data files in binary reading mode
        for attribute_name in self.binary_data_attributes:
            path2file = self.bin_file_location / (attribute_name + BINARY_FILE_ENDING)
            if self.in_memory:
                # read the whole file once; a fresh BytesIO is already positioned at offset 0
                with open(path2file, mode="rb") as bin_file:
                    binary_data = BytesIO(bin_file.read())
            else:
                # intentionally kept open for the lifetime of the instance, closed in ``__del__``
                binary_data = open(path2file, mode="rb")
            setattr(self, attribute_name, binary_data)

    def __del__(self):
        """closes all binary data files (or in-memory buffers) that have been opened by this instance"""
        for attribute_name in self.binary_data_attributes:
            # ``__init__`` may have failed before a slot was assigned (e.g. missing data file):
            # only close what actually exists instead of raising during finalization
            binary_data = getattr(self, attribute_name, None)
            if binary_data is not None:
                binary_data.close()

    @property
    def nr_of_zones(self):
        """the number of entries in ``self.timezone_names``"""
        return len(self.timezone_names)

    @staticmethod
    def using_numba() -> bool:
        """
        :return: True if Numba is being used to JIT compile helper functions
        """
        return utils.using_numba

    @staticmethod
    def using_clang_pip() -> bool:
        """
        :return: True if the compiled C implementation of the point in polygon algorithm is being used
        """
        return utils.inside_polygon == utils.pt_in_poly_clang

    def zone_id_of(self, poly_id: int) -> int:
        """reads the zone id of a polygon from the binary ``POLY_ZONE_IDS`` data

        :param poly_id: the id (index) of the polygon
        :return: the id of the zone the polygon belongs to
        """
        poly_zone_ids = getattr(self, POLY_ZONE_IDS)
        # one entry of ``NR_BYTES_H`` bytes per polygon
        poly_zone_ids.seek(NR_BYTES_H * poly_id)
        return unpack(DTYPE_FORMAT_H, poly_zone_ids.read(NR_BYTES_H))[0]

    def zone_ids_of(self, poly_ids: np.ndarray) -> np.ndarray:
        """looks up the zone id of every given polygon

        :param poly_ids: the ids of the polygons
        :return: an array with the zone id of each polygon, in the order of ``poly_ids``
        """
        id_array = np.empty(shape=len(poly_ids), dtype=DTYPE_FORMAT_H_NUMPY)
        for i, poly_id in enumerate(poly_ids):
            id_array[i] = self.zone_id_of(poly_id)
        return id_array

    def zone_name_from_id(self, zone_id: int) -> str:
        """
        :param zone_id: the id of the timezone (=index in ``self.timezone_names``)
        :return: the name of the timezone
        :raises ValueError: if there is no timezone name stored at this index
        """
        try:
            return self.timezone_names[zone_id]
        except IndexError as err:
            raise ValueError("timezone could not be found. index error.") from err

    def zone_name_from_poly_id(self, poly_id: int) -> str:
        """
        :param poly_id: the id (index) of the polygon
        :return: the name of the timezone the polygon belongs to
        """
        zone_id = self.zone_id_of(poly_id)
        return self.zone_name_from_id(zone_id)

    def get_shortcut_polys(self, *, lng: float, lat: float) -> np.ndarray:
        """
        :param lng: longitude of the point in degree
        :param lat: latitude of the point in degree
        :return: the shortcut entry (polygon ids) stored for the H3 cell (resolution ``SHORTCUT_H3_RES``)
            which contains the point
        """
        hex_id = h3.geo_to_h3(lat, lng, SHORTCUT_H3_RES)
        return self.shortcut_mapping[hex_id]

    def most_common_zone_id(self, *, lng: float, lat: float) -> Optional[int]:
        """
        :return: the zone id of the last polygon in the shortcut of the point,
            ``None`` if the shortcut does not contain any polygons
        """
        polys = self.get_shortcut_polys(lng=lng, lat=lat)
        if len(polys) == 0:
            return None
        # Note: polygons are sorted from small to big in the shortcuts (grouped by zone)
        # -> the polygons of the biggest zone come last
        poly_of_biggest_zone = polys[-1]
        return self.zone_id_of(poly_of_biggest_zone)

    def unique_zone_id(self, *, lng: float, lat: float) -> Optional[int]:
        """
        :return: the zone id at the coordinate if there is exactly one possible zone, else `None`
        """
        polys = self.get_shortcut_polys(lng=lng, lat=lat)
        if len(polys) == 0:
            return None
        if len(polys) == 1:
            return self.zone_id_of(polys[0])
        zones_unique = np.unique(self.zone_ids_of(polys))
        if len(zones_unique) == 1:
            # convert the numpy scalar: every branch returns a plain ``int`` as annotated
            return int(zones_unique[0])
        # more than one zone in this shortcut
        return None

    @abstractmethod
    def timezone_at(self, *, lng: float, lat: float) -> Optional[str]:
        """looks up in which timezone the given coordinate is included in

        :param lng: longitude of the point in degree (-180.0 to 180.0)
        :param lat: latitude in degree (90.0 to -90.0)
        :return: the timezone name of a matching polygon or None
        """
        ...

    def timezone_at_land(self, *, lng: float, lat: float) -> Optional[str]:
        """computes in which land timezone a point is included in

        Especially for large polygons it is expensive to check if a point is really included.
        To speed things up there are "shortcuts" being used (stored in a binary file),
        which have been precomputed and store which timezone polygons have to be checked.

        :param lng: longitude of the point in degree (-180.0 to 180.0)
        :param lat: latitude in degree (90.0 to -90.0)
        :return: the timezone name of a matching polygon or
            ``None`` when an ocean timezone ("Etc/GMT+-XX") has been matched.
        """
        tz_name = self.timezone_at(lng=lng, lat=lat)
        if tz_name is not None and utils.is_ocean_timezone(tz_name):
            return None
        return tz_name

    def unique_timezone_at(self, *, lng: float, lat: float) -> Optional[str]:
        """returns the name of a unique zone within the corresponding shortcut

        :param lng: longitude of the point in degree (-180.0 to 180.0)
        :param lat: latitude in degree (90.0 to -90.0)
        :return: the timezone name of the unique zone or ``None`` if there are no or multiple zones in this shortcut
        """
        lng, lat = utils.validate_coordinates(lng, lat)
        unique_id = self.unique_zone_id(lng=lng, lat=lat)
        if unique_id is None:
            return None
        return self.zone_name_from_id(unique_id)
class TimezoneFinderL(AbstractTimezoneFinder):
    """a 'light' version of the TimezoneFinder class for quickly suggesting a timezone for a point on earth

    No point in polygon test is performed by this class.
    Only the precomputed 'shortcut' entry of the H3 cell containing the query point is consulted
    in order to suggest a probable result: the zone of the last polygon listed in this shortcut.
    """

    def timezone_at(self, *, lng: float, lat: float) -> Optional[str]:
        """instantly returns the name of the zone suggested by the corresponding shortcut

        The coordinates are validated, then the zone id picked by ``most_common_zone_id()``
        is translated into its timezone name.

        :param lng: longitude of the point in degree (-180.0 to 180.0)
        :param lat: latitude in degree (90.0 to -90.0)
        :return: the timezone name of the suggested zone or None if there are no timezone polygons in this shortcut
        """
        lng, lat = utils.validate_coordinates(lng, lat)
        suggested_zone_id = self.most_common_zone_id(lng=lng, lat=lat)
        return None if suggested_zone_id is None else self.zone_name_from_id(suggested_zone_id)
class TimezoneFinder(AbstractTimezoneFinder):
    """Class for quickly finding the timezone of a point on earth offline.

    Because of indexing ("shortcuts"), not all timezone polygons have to be tested during a query.
    Opens the required timezone polygon data in binary files to enable fast access.
    For a detailed documentation of data management please refer to the code documentation of
    `file_converter.py <https://github.com/jannikmi/timezonefinder/blob/master/scripts/file_converter.py>`__

    :ivar binary_data_attributes: the names of all attributes which store the opened binary data files
    :param bin_file_location: path to the binary data files to use, None if native package data should be used
    :param in_memory: whether to completely read and keep the binary files in memory
    """

    # __slots__ declared in parents are available in child classes. However, child subclasses will get a __dict__
    # and __weakref__ unless they also define __slots__ (which should only contain names of any additional slots).
    __slots__ = DATA_ATTRIBUTE_NAMES

    binary_data_attributes = BINARY_DATA_ATTRIBUTES

    def __init__(self, bin_file_location: Optional[Union[str, Path]] = None, in_memory: bool = False):
        super().__init__(bin_file_location, in_memory)

        # stores for which polygons (how many) holes exist and the id of the first of those holes
        # since there are very few it is feasible to keep them in the memory
        with open(self.bin_file_location / HOLE_REGISTRY_FILE) as json_file:
            hole_registry_tmp = json.load(json_file)
        # convert the json string keys to int
        hole_registry = {int(k): v for k, v in hole_registry_tmp.items()}
        setattr(self, HOLE_REGISTRY, hole_registry)

    @property
    def nr_of_polygons(self) -> int:
        """the number of polygons, derived from the size of the ``POLY_ZONE_IDS`` data (one entry per polygon)"""
        poly_zone_ids = getattr(self, POLY_ZONE_IDS)
        return utils.get_file_size_byte(poly_zone_ids) // NR_BYTES_H

    def coords_of(self, polygon_nr: int = 0) -> np.ndarray:
        """reads the boundary coordinates of a polygon from the binary data

        :param polygon_nr: the id (index) of the polygon
        :return: an array with two rows read consecutively from the polygon data:
            first all x, then all y coordinates (stored as ints)
        """
        poly_coord_amount = getattr(self, POLY_COORD_AMOUNT)
        poly_adr2data = getattr(self, POLY_ADR2DATA)
        poly_data = getattr(self, POLY_DATA)

        # how many coordinates are stored in this polygon
        poly_coord_amount.seek(NR_BYTES_I * polygon_nr)
        nr_of_values = unpack(DTYPE_FORMAT_I, poly_coord_amount.read(NR_BYTES_I))[0]

        # jump to the address where the data of this polygon starts
        poly_adr2data.seek(NR_BYTES_I * polygon_nr)
        poly_data.seek(unpack(DTYPE_FORMAT_I, poly_adr2data.read(NR_BYTES_I))[0])

        # ATTENTION: the order of the reads matters, the second read continues where the first one stopped
        x_coords = self._fromfile(poly_data, dtype=DTYPE_FORMAT_SIGNED_I_NUMPY, count=nr_of_values)
        y_coords = self._fromfile(poly_data, dtype=DTYPE_FORMAT_SIGNED_I_NUMPY, count=nr_of_values)
        return np.stack((x_coords, y_coords))

    def _holes_of_poly(self, polygon_nr: int):
        """lazily yields the coordinates of all holes of a polygon

        :param polygon_nr: the id (index) of the polygon
        :return: a generator of arrays (first row: x coordinates, second row: y coordinates), one per hole.
            Yields nothing when the polygon is not listed in the hole registry.
        """
        hole_coord_amount = getattr(self, HOLE_COORD_AMOUNT)
        hole_adr2data = getattr(self, HOLE_ADR2DATA)
        hole_data = getattr(self, HOLE_DATA)
        hole_registry = getattr(self, HOLE_REGISTRY)

        try:
            amount_of_holes, first_hole_id = hole_registry[polygon_nr]
        except KeyError:
            # this polygon has no holes
            return

        # the entries of all holes of a polygon are stored consecutively:
        # position both files once, then read entry after entry
        hole_coord_amount.seek(NR_BYTES_H * first_hole_id)
        hole_adr2data.seek(NR_BYTES_I * first_hole_id)
        for _ in range(amount_of_holes):
            nr_of_values = unpack(DTYPE_FORMAT_H, hole_coord_amount.read(NR_BYTES_H))[0]
            hole_data.seek(unpack(DTYPE_FORMAT_I, hole_adr2data.read(NR_BYTES_I))[0])
            x_coords = self._fromfile(hole_data, dtype=DTYPE_FORMAT_SIGNED_I_NUMPY, count=nr_of_values)
            y_coords = self._fromfile(hole_data, dtype=DTYPE_FORMAT_SIGNED_I_NUMPY, count=nr_of_values)
            yield np.array([x_coords, y_coords])

    def get_polygon(self, polygon_nr: int, coords_as_pairs: bool = False) -> List[Union[CoordPairs, CoordLists]]:
        """retrieves the converted coordinates of a polygon and all of its holes

        :param polygon_nr: the id (index) of the polygon
        :param coords_as_pairs: if ``True`` ``utils.convert2coord_pairs`` is used for converting the coordinates,
            otherwise ``utils.convert2coords``
        :return: a list with the converted polygon boundary first, followed by its converted holes
        """
        conversion_method = utils.convert2coord_pairs if coords_as_pairs else utils.convert2coords
        list_of_converted_polygons = [conversion_method(self.coords_of(polygon_nr=polygon_nr))]
        list_of_converted_polygons.extend(conversion_method(hole) for hole in self._holes_of_poly(polygon_nr))
        return list_of_converted_polygons

    def get_geometry(
        self,
        tz_name: Optional[str] = "",
        tz_id: Optional[int] = 0,
        use_id: bool = False,
        coords_as_pairs: bool = False,
    ):
        """retrieves the geometry of a timezone polygon

        :param tz_name: one of the names in ``timezone_names.json`` or ``self.timezone_names``
        :param tz_id: the id of the timezone (=index in ``self.timezone_names``)
        :param use_id: if ``True`` uses ``tz_id`` instead of ``tz_name``
        :param coords_as_pairs: determines the structure of the polygon representation
        :return: a data structure representing the multipolygon of this timezone
            output format: ``[ [polygon1, hole1, hole2...], [polygon2, ...], ...]``
            and each polygon and hole is itself formatted like: ``([longitudes], [latitudes])``
            or ``[(lng1,lat1), (lng2,lat2),...]`` if ``coords_as_pairs=True``.
        :raises TypeError: if ``use_id`` is set and ``tz_id`` is not an int
        :raises ValueError: if the given zone id is out of range or the given timezone name is unknown
        """
        if use_id:
            if not isinstance(tz_id, int):
                raise TypeError("the zone id must be given as int.")
            if tz_id < 0 or tz_id >= self.nr_of_zones:
                raise ValueError(f"the given zone id {tz_id} is invalid (value range: 0 - {self.nr_of_zones - 1}).")
        else:
            try:
                tz_id = self.timezone_names.index(tz_name)
            except ValueError as err:
                # build ONE readable message (separate positional args would end up as a tuple in the exception)
                raise ValueError(f"The timezone '{tz_name}' does not exist.") from err

        poly_id2zone_id = getattr(self, POLY_NR2ZONE_ID)
        poly_id2zone_id.seek(NR_BYTES_H * tz_id)
        # read poly_id of the first polygon of that zone
        this_zone_poly_id = unpack(DTYPE_FORMAT_H, poly_id2zone_id.read(NR_BYTES_H))[0]
        # read poly_id of the first polygon of the consequent zone
        # (also exists for the last zone, cf. file_converter.py)
        next_zone_poly_id = unpack(DTYPE_FORMAT_H, poly_id2zone_id.read(NR_BYTES_H))[0]
        # read and return all polygons from this zone:
        return [self.get_polygon(poly_id, coords_as_pairs) for poly_id in range(this_zone_poly_id, next_zone_poly_id)]

    def outside_the_boundaries_of(self, poly_id: int, x: int, y: int) -> bool:
        """checks a point against the stored bounding values of a polygon

        :param poly_id: the id (index) of the polygon
        :param x: the x coordinate (longitude) of the point, converted to int
        :param y: the y coordinate (latitude) of the point, converted to int
        :return: ``True`` if the point lies outside of the stored boundaries of the polygon
        """
        # get the boundaries of the polygon = (lng_max, lng_min, lat_max, lat_min) converted to int32
        poly_max_values = getattr(self, POLY_MAX_VALUES)
        poly_max_values.seek(4 * NR_BYTES_I * poly_id)
        xmax, xmin, ymax, ymin = self._fromfile(
            poly_max_values,
            dtype=DTYPE_FORMAT_SIGNED_I_NUMPY,
            count=4,
        )
        return x > xmax or x < xmin or y > ymax or y < ymin

    def inside_of_polygon(self, poly_id: int, x: int, y: int) -> bool:
        """checks if a point is included in a polygon, but not in any of its holes

        :param poly_id: the id (index) of the polygon
        :param x: the x coordinate (longitude) of the point, converted to int
        :param y: the y coordinate (latitude) of the point, converted to int
        :return: ``True`` if the point is included in the polygon and lies in none of its holes
        """
        # only read polygon (hole) data on demand
        # only run the expensive algorithm if the point is within the boundaries
        if self.outside_the_boundaries_of(poly_id, x, y):
            return False

        if not inside_polygon(x, y, self.coords_of(polygon_nr=poly_id)):
            return False

        # when the point is within a hole of the polygon, this timezone must not be returned
        # Note: the holes are read lazily, ``any()`` stops at the first hole containing the point
        if any(inside_polygon(x, y, hole) for hole in self._holes_of_poly(poly_id)):
            return False

        # the query point is included in this polygon, but not any hole
        return True

    def timezone_at(self, *, lng: float, lat: float) -> Optional[str]:
        """computes in which ocean OR land timezone a point is included in

        Especially for large polygons it is expensive to check if a point is really included.
        In case there is only one possible zone (left), this zone will instantly be returned without actually checking
        if the query point is included in this polygon.

        To speed things up there are "shortcuts" being used
        which have been precomputed and store which timezone polygons have to be checked.

        .. note:: Since ocean timezones span the whole globe, some timezone will always be matched!
            `None` can only be returned when you have compiled timezone data without such "full coverage".

        :param lng: longitude of the point in degree (-180.0 to 180.0)
        :param lat: latitude in degree (90.0 to -90.0)
        :return: the timezone name of the matched timezone polygon. possibly "Etc/GMT+-XX" in case of an ocean timezone.
        """
        lng, lat = utils.validate_coordinates(lng, lat)
        possible_polygons = self.get_shortcut_polys(lng=lng, lat=lat)
        nr_possible_polygons = len(possible_polygons)
        if nr_possible_polygons == 0:
            # Note: hypothetical case, with ocean data every shortcut maps to at least one polygon
            return None
        if nr_possible_polygons == 1:
            # there is only one polygon in that area. return its timezone name without further checks
            polygon_id = possible_polygons[0]
            return self.zone_name_from_poly_id(polygon_id)

        # create a list of all the timezone ids of all possible polygons
        zone_ids = self.zone_ids_of(possible_polygons)

        last_zone_change_idx = utils.get_last_change_idx(zone_ids)
        if last_zone_change_idx == 0:
            # no change of the zone id found: return the zone of the first polygon without further checks
            return self.zone_name_from_id(zone_ids[0])

        # ATTENTION: the polygons are stored converted to 32-bit ints,
        # convert the query coordinates in the same fashion in order to make the data formats match
        # x = longitude  y = latitude
        x = utils.coord2int(lng)
        y = utils.coord2int(lat)

        # check until the point is included in one of the possible polygons
        for i, poly_id in enumerate(possible_polygons):
            if i >= last_zone_change_idx:
                # only polygons of the last possible zone are left
                break
            if self.inside_of_polygon(poly_id, x, y):
                zone_id = zone_ids[i]
                return self.zone_name_from_id(zone_id)

        # since it is the last possible option,
        # the polygons of the last possible zone don't actually have to be checked
        zone_id = zone_ids[-1]
        return self.zone_name_from_id(zone_id)

    def certain_timezone_at(self, *, lng: float, lat: float) -> Optional[str]:
        """checks in which timezone polygon the point is certainly included in

        .. note:: this is only meaningful when you have compiled your own timezone data
            where there are areas without timezone polygon coverage.
            Otherwise some timezone will always be matched and the functionality is equal to using `.timezone_at()`
            -> useless to actually test all polygons.

        .. note:: using this function is less performant than `.timezone_at()`

        :param lng: longitude of the point in degree
        :param lat: latitude in degree
        :return: the timezone name of the polygon the point is included in or `None`
        """
        lng, lat = utils.validate_coordinates(lng, lat)
        possible_polygons = self.get_shortcut_polys(lng=lng, lat=lat)
        nr_possible_polygons = len(possible_polygons)
        if nr_possible_polygons == 0:
            # Note: hypothetical case, with ocean data every shortcut maps to at least one polygon
            return None

        # ATTENTION: the polygons are stored converted to 32-bit ints,
        # convert the query coordinates in the same fashion in order to make the data formats match
        # x = longitude  y = latitude
        x = utils.coord2int(lng)
        y = utils.coord2int(lat)

        # check if the query point is found to be truly included in one of the possible polygons
        for poly_id in possible_polygons:
            if self.inside_of_polygon(poly_id, x, y):
                zone_id = self.zone_id_of(poly_id)
                return self.zone_name_from_id(zone_id)

        # none of the polygon candidates truly matched
        return None