""" pandashp: read/write shapefiles to/from special DataFrames
Offers two functions read_shp and write_shp that convert ESRI shapefiles to
pandas DataFrames that can be manipulated at will and then written back to
shapefiles. Opens up data manipulation capabilities beyond a simple GIS field
calculator.
Usage:
import pandashp as pdshp
# calculate population density from shapefile of cities (stupid, I know)
cities = pdshp.read_shp('cities_germany_projected')
cities['popdens'] = cities['population'] / cities['area']
pdshp.write_shp(cities, 'cities_germany_projected_popdens')
"""
try:
from itertools import izip as zip
except ImportError: # zip is a builtin in Python 3.x
pass
import numpy as np
import pandas as pd
import shapefile
from . import shapelytools
import warnings
from shapely.geometry import LineString, Point, Polygon
[docs]def read_shp(filename):
"""Read shapefile to dataframe w/ geometry.
Args:
filename: ESRI shapefile name to be read (without .shp extension)
Returns:
pandas DataFrame with column geometry, containing individual shapely
Geometry objects (i.e. Point, LineString, Polygon) depending on
the shapefiles original shape type
"""
sr = shapefile.Reader(filename)
cols = sr.fields[:] # [:] = duplicate field list
if cols[0][0] == 'DeletionFlag':
cols.pop(0)
cols = [col[0] for col in cols] # extract field name only
cols.append('geometry')
records = [row for row in sr.iterRecords()]
if sr.shapeType == shapefile.POLYGON:
geometries = [Polygon(shape.points)
if len(shape.points) > 2 else np.NaN # invalid geometry
for shape in sr.iterShapes()]
elif sr.shapeType == shapefile.POLYLINE:
geometries = [LineString(shape.points) for shape in sr.iterShapes()]
elif sr.shapeType == shapefile.POINT:
geometries = [Point(*shape.points[0]) for shape in sr.iterShapes()]
else:
raise NotImplementedError
data = [r + [g] for r, g in zip(records, geometries)]
df = pd.DataFrame(data, columns=cols)
df = df.convert_objects(convert_numeric=True)
if np.NaN in geometries:
# drop invalid geometries
df = df.dropna(subset=['geometry'])
num_skipped = len(geometries) - len(df)
warnings.warn('Skipped {} invalid geometrie(s).'.format(num_skipped))
return df
[docs]def write_shp(filename, dataframe, write_index=True):
"""Write dataframe w/ geometry to shapefile.
Args:
filename: ESRI shapefile name to be written (without .shp extension)
dataframe: a pandas DataFrame with column geometry and homogenous
shape types (Point, LineString, or Polygon)
write_index: add index as column to attribute tabel (default: true)
Returns:
Nothing.
"""
df = dataframe.copy()
if write_index:
df.reset_index(inplace=True)
# split geometry column from dataframe
geometry = df.pop('geometry')
# write geometries to shp/shx, according to geometry type
if isinstance(geometry.iloc[0], Point):
sw = shapefile.Writer(shapefile.POINT)
for point in geometry:
sw.point(point.x, point.y)
elif isinstance(geometry.iloc[0], LineString):
sw = shapefile.Writer(shapefile.POLYLINE)
for line in geometry:
sw.line([list(line.coords)])
elif isinstance(geometry.iloc[0], Polygon):
sw = shapefile.Writer(shapefile.POLYGON)
for polygon in geometry:
sw.poly([list(polygon.exterior.coords)])
else:
raise NotImplementedError
# add fields for dbf
for k, column in enumerate(df.columns):
# unicode strings freak out pyshp, so remove u'..'
column = str(column)
if np.issubdtype(df.dtypes[k], np.number):
# detect and convert integer-only columns
if (df[column] % 1 == 0).all():
df[column] = df[column].astype(np.integer)
# now create the appropriate fieldtype
if np.issubdtype(df.dtypes[k], np.floating):
sw.field(column, 'N', decimal=5)
else:
sw.field(column, 'I', decimal=0)
else:
sw.field(column)
# add records to dbf
for record in df.itertuples():
sw.record(*record[1:]) # drop first tuple element (=index)
sw.save(filename)
[docs]def match_vertices_and_edges(vertices, edges, vertex_cols=('Vertex1', 'Vertex2')):
"""Adds unique IDs to vertices and corresponding edges.
Identifies, which nodes coincide with the endpoints of edges and creates
matching IDs for matching points, thus creating a node-edge graph whose
edges are encoded purely by node ID pairs. The optional argument
vertex_cols specifies which DataFrame columns of edges are added, default
is 'Vertex1' and 'Vertex2'.
Args:
vertices: pandas DataFrame with geometry column of type Point
edges: pandas DataFrame with geometry column of type LineString
vertex_cols: tuple of 2 strings for the IDs numbers
Returns:
Nothing, the mathing IDs are added to the columns vertex_cols in
argument edges
"""
vertex_indices = []
for e, line in enumerate(edges.geometry):
edge_endpoints = []
for k, vertex in enumerate(vertices.geometry):
# change start -------
# if line.touches(vertex) or line.intersects(vertex):
# edge_endpoints.append(vertices.index[k])
if vertex.buffer(0.001).intersects(line) or line.intersects(vertex):
edge_endpoints.append(vertices.index[k])
# change stop -------
if len(edge_endpoints) == 0:
warnings.warn("edge " + str(e) +
" has no endpoints: " + str(edge_endpoints))
elif len(edge_endpoints) == 1:
warnings.warn("edge " + str(e) +
" has only 1 endpoint: " + str(edge_endpoints))
vertex_indices.append(edge_endpoints)
edges[vertex_cols[0]] = pd.Series([min(n1n2) for n1n2 in vertex_indices],
index=edges.index)
edges[vertex_cols[1]] = pd.Series([max(n1n2) for n1n2 in vertex_indices],
index=edges.index)
[docs]def find_closest_edge(polygons, edges, to_attr='index', column='nearest'):
"""Find closest edge for centroid of polygons.
Args:
polygons: a pandas DataFrame with geometry column of Polygons
edges: a pandas DataFrame with geometry column of LineStrings
to_attr: a column name in DataFrame edges (default: index)
column: a column name to be added/overwrite in DataFrame polygons with
the value of column to_attr from the nearest edge in edges
Returns:
a list of LineStrings connecting polygons' centroids with the nearest
point in in edges. Side effect: polygons recieves new column with the
attribute value of nearest edge. Warning: if column exists, it is
overwritten.
"""
connecting_lines = []
nearest_indices = []
centroids = [b.centroid for b in polygons['geometry']]
for centroid in centroids:
nearest_edge, _, nearest_index = shapelytools.closest_object(
edges['geometry'], centroid)
nearest_point = shapelytools.project_point_to_object(
centroid, nearest_edge)
connecting_lines.append(LineString(tuple(centroid.coords) +
tuple(nearest_point.coords)))
nearest_indices.append(edges[to_attr][nearest_index])
polygons[column] = pd.Series(nearest_indices, index=polygons.index)
return pd.DataFrame({'geometry': connecting_lines})
[docs]def bounds(df):
"""Return a DataFrame of minx, miny, maxx, maxy of each geometry."""
bounds = np.array([geom.bounds for geom in df.geometry])
return pd.DataFrame(bounds,
columns=['minx', 'miny', 'maxx', 'maxy'],
index=df.index)
[docs]def total_bounds(df):
"""Return bounding box (minx, miny, maxx, maxy) of all geometries. """
b = bounds(df)
return (b['minx'].min(),
b['miny'].min(),
b['maxx'].max(),
b['maxy'].max())