# GPS parsing example
```python
import logging
from pathlib import Path

from bhf_smarthealth.parse import gps_parser as bhf_gps
from bhf_smarthealth.utils import data_io
from bhf_smarthealth.utils import df_utils
from bhf_smarthealth.config import DEFAULT_TIME_PERIOD

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Save to repo to use in testing
FILEOUT_PATH = Path("../../../tests/data/processed_synth_gps_2025-20-09.parquet")
LSOA_path = Path("../../../tests/data/lsoas_for_testing.pkl")
data_source = Path("../../../tests/data/synth_gps_2025-10-09.json")
```
Load raw JSON file
```python
gps_json = bhf_gps.load_gps_json(data_source)
```

```text
INFO:bhf_smarthealth.parse.gps_parser:Loaded 828 GPS data points from ../../../tests/data/synth_gps_2025-10-09.json
```
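The loader returns plain Python data holding the timestamped fixes. As a rough illustration only (the exact field names are an assumption, not confirmed by the package), one raw record might look like this:

```python
# Hypothetical shape of a single raw GPS record; field names are assumed for illustration.
example_record = {
    "datetime": "2025-10-09T00:00:00",  # ISO 8601 timestamp
    "lat": 53.380663,                   # latitude in decimal degrees (WGS 84)
    "long": -1.470228,                  # longitude in decimal degrees (WGS 84)
}
```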
Validate raw data using Pydantic
```python
gps_val = bhf_gps.validate_gps_data(gps_json)
```
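The package's Pydantic model is not shown here, but a minimal sketch of the kind of model such validation typically uses (hypothetical class name and field constraints, for illustration only) would be:

```python
import datetime as dt

from pydantic import BaseModel, Field

class GPSPoint(BaseModel):
    """Hypothetical model for a single GPS fix; the real model lives in bhf_smarthealth."""

    datetime: dt.datetime                 # parsed and type-checked from the ISO string
    lat: float = Field(ge=-90, le=90)     # latitude must be a valid degree value
    long: float = Field(ge=-180, le=180)  # longitude must be a valid degree value
```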
Convert the JSON dict to a GeoDataFrame
```python
gps_gdf = bhf_gps.gps_to_dataframe(gps_val)
```
Convert raw GPS latitude and longitude to geometry
```python
gps_gdf = bhf_gps.add_gps_as_geometry(gps_gdf)
gps_gdf.head()
```
| datetime | long | lat | geometry |
|---|---|---|---|
| 2025-10-09 00:00:00 | -1.470228 | 53.380663 | POINT (-1.47023 53.38066) |
| 2025-10-09 00:00:15 | -1.470220 | 53.380797 | POINT (-1.47022 53.3808) |
| 2025-10-09 00:00:27 | -1.470207 | 53.380905 | POINT (-1.47021 53.3809) |
| 2025-10-09 00:00:36 | -1.470205 | 53.380986 | POINT (-1.4702 53.38099) |
| 2025-10-09 00:00:43 | -1.470207 | 53.381049 | POINT (-1.47021 53.38105) |
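Under the hood this step is presumably building shapely points from the long/lat columns and setting them as the active geometry in WGS 84; a minimal sketch with geopandas (not the package's actual implementation):

```python
import geopandas as gpd

def lat_long_to_geometry(df):
    # x = longitude, y = latitude; EPSG:4326 (WGS 84) is assumed for raw GPS fixes.
    return gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["long"], df["lat"]),
        crs="EPSG:4326",
    )
```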
Calculate distances
gps_gdf["distance"] = bhf_gps.calculate_distances(gps_gdf)
gps_gdf.head()
| datetime | long | lat | geometry | distance |
|---|---|---|---|---|
| 2025-10-09 00:00:00 | -1.470228 | 53.380663 | POINT (-1.47023 53.38066) | NaN |
| 2025-10-09 00:00:15 | -1.470220 | 53.380797 | POINT (-1.47022 53.3808) | 14.995 |
| 2025-10-09 00:00:27 | -1.470207 | 53.380905 | POINT (-1.47021 53.3809) | 11.996 |
| 2025-10-09 00:00:36 | -1.470205 | 53.380986 | POINT (-1.4702 53.38099) | 8.997 |
| 2025-10-09 00:00:43 | -1.470207 | 53.381049 | POINT (-1.47021 53.38105) | 6.998 |
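The values look like metres between consecutive fixes. One plausible way to produce them (an assumption about calculate_distances, not its actual code) is to reproject to a metric CRS and measure each point against the previous one:

```python
def consecutive_distances(gdf, metric_crs="EPSG:27700"):
    # EPSG:27700 (British National Grid) is an assumption that suits Sheffield data;
    # distances come out in metres, with NaN for the first row.
    projected = gdf.to_crs(metric_crs)
    return projected.geometry.distance(projected.geometry.shift())
```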
Calculate speeds
gps_gdf["speed"] = bhf_gps.calculate_speeds(gps_gdf)
gps_gdf.head()
| datetime | long | lat | geometry | distance | speed |
|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | -1.470228 | 53.380663 | POINT (-1.47023 53.38066) | NaN | NaN |
| 2025-10-09 00:00:15 | -1.470220 | 53.380797 | POINT (-1.47022 53.3808) | 14.995 | 1.0 |
| 2025-10-09 00:00:27 | -1.470207 | 53.380905 | POINT (-1.47021 53.3809) | 11.996 | 1.0 |
| 2025-10-09 00:00:36 | -1.470205 | 53.380986 | POINT (-1.4702 53.38099) | 8.997 | 1.0 |
| 2025-10-09 00:00:43 | -1.470207 | 53.381049 | POINT (-1.47021 53.38105) | 6.998 | 1.0 |
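Speed appears to be the per-row distance divided by the elapsed time between fixes (14.995 m over 15 s gives the 1.0 shown above). A minimal sketch under that assumption:

```python
def consecutive_speeds(gdf):
    # Seconds since the previous fix, taken from the datetime index.
    elapsed_s = gdf.index.to_series().diff().dt.total_seconds()
    return gdf["distance"] / elapsed_s  # metres per second; NaN for the first row
```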
GPS data cleaning:

- Remove rows where the speed between points is unreasonably large (> 130 kph / 35 m/s).
- Where there is a large time jump, set the distance and speed to NULL so they are not included in calculations.
```python
gps_gdf_clean = df_utils.apply_func_list_to_df(
    df=gps_gdf,
    func_list=[
        bhf_gps.filter_speeds,
        bhf_gps.filter_time_jumps,
    ],
)
gps_gdf_clean.head()
```
| datetime | long | lat | geometry | distance | speed |
|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | -1.470228 | 53.380663 | POINT (-1.47023 53.38066) | NaN | NaN |
| 2025-10-09 00:00:15 | -1.470220 | 53.380797 | POINT (-1.47022 53.3808) | 14.995 | 1.0 |
| 2025-10-09 00:00:27 | -1.470207 | 53.380905 | POINT (-1.47021 53.3809) | 11.996 | 1.0 |
| 2025-10-09 00:00:36 | -1.470205 | 53.380986 | POINT (-1.4702 53.38099) | 8.997 | 1.0 |
| 2025-10-09 00:00:43 | -1.470207 | 53.381049 | POINT (-1.47021 53.38105) | 6.998 | 1.0 |
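apply_func_list_to_df presumably just applies the functions in order, each one receiving the previous result. A minimal equivalent, together with hypothetical versions of the two filters (thresholds and column handling are illustrative, not the package's actual logic):

```python
from functools import reduce

def apply_in_sequence(df, func_list):
    # Equivalent of chaining: func_list[-1](...func_list[0](df)...)
    return reduce(lambda acc, func: func(acc), func_list, df)

def filter_speeds_sketch(df, max_speed=35.0):
    # Drop rows whose implied speed exceeds ~130 kph / 35 m/s; keep NaN speeds (first fix).
    return df[df["speed"].isna() | (df["speed"] <= max_speed)]

def filter_time_jumps_sketch(df, max_gap_s=60.0):
    # Where the gap to the previous fix is too large, null the distance and speed
    # so they do not contribute to later aggregations.
    gap = df.index.to_series().diff().dt.total_seconds()
    df = df.copy()
    df.loc[gap > max_gap_s, ["distance", "speed"]] = None
    return df
```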
Load the LSOA boundaries and join them to the GPS points
```python
lsoa_data = data_io.load_data_from_file(LSOA_path)
gps_lsoa_gdf = bhf_gps.join_spatial_data(gps_gdf_clean, lsoa_data)
gps_lsoa_gdf.head()
```
| datetime | long | lat | geometry | distance | speed | index_right | LSOA21CD | LSOA21NM |
|---|---|---|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | -1.470228 | 53.380663 | POINT (-1.47023 53.38066) | NaN | NaN | 28.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:15 | -1.470220 | 53.380797 | POINT (-1.47022 53.3808) | 14.995 | 1.0 | 28.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:27 | -1.470207 | 53.380905 | POINT (-1.47021 53.3809) | 11.996 | 1.0 | 28.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:36 | -1.470205 | 53.380986 | POINT (-1.4702 53.38099) | 8.997 | 1.0 | 28.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:43 | -1.470207 | 53.381049 | POINT (-1.47021 53.38105) | 6.998 | 1.0 | 28.0 | E01033264 | Sheffield 073D |
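join_spatial_data is most likely a thin wrapper around a geopandas spatial join; a sketch assuming point-in-polygon matching against the LSOA boundary polygons:

```python
import geopandas as gpd

def join_points_to_lsoas(points_gdf, lsoa_gdf):
    # Both layers must share a CRS before joining; a left join keeps every GPS fix,
    # with the index_right column recording the matched LSOA row.
    lsoa_gdf = lsoa_gdf.to_crs(points_gdf.crs)
    return gpd.sjoin(points_gdf, lsoa_gdf, how="left", predicate="within")
```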
Drop unneeded columns
columns = ["lat", "long", "geometry", "index_right"]
lsoa_gdf = df_utils.remove_columns(gps_lsoa_gdf, col_names=columns)
lsoa_gdf.head()
| datetime | distance | speed | LSOA21CD | LSOA21NM |
|---|---|---|---|---|
| 2025-10-09 00:00:00 | NaN | NaN | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:15 | 14.995 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:27 | 11.996 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:36 | 8.997 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:43 | 6.998 | 1.0 | E01033264 | Sheffield 073D |
Set LSOA values to NULL at bad timepoints, i.e. where the distance is NULL. This is done separately from the time-jump filter so that the LSOA value can be kept if needed later in the pipeline, for example if the modal LSOA needs to be calculated from the raw data rather than from the aggregated interval data.
```python
lsoa_gdf_clean = bhf_gps.nullify_lsoa_at_invalid_timepoints(lsoa_gdf)
lsoa_gdf_clean.head()
```
| datetime | distance | speed | LSOA21CD | LSOA21NM |
|---|---|---|---|---|
| 2025-10-09 00:00:00 | NaN | NaN | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:15 | 14.995 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:27 | 11.996 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:36 | 8.997 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:00:43 | 6.998 | 1.0 | E01033264 | Sheffield 073D |
Aggregate the data into time windows. The latest LSOA in each time window is selected; if the latest LSOA value in the window is associated with poor-quality data (e.g. after a time jump), the most recent valid LSOA is taken instead, moving backwards.
```python
agg_dict = {
    "distance": ["size", "sum"],
    "speed": ["mean"],
    "LSOA21CD": ["last"],
    "LSOA21NM": ["last"],
}
lsoa_agg = df_utils.apply_agg_over_interval(
    lsoa_gdf_clean, agg_dict=agg_dict, time_period=DEFAULT_TIME_PERIOD
)
lsoa_agg.head()
```
| datetime | distance_size | distance_sum | speed_mean | LSOA21CD_last | LSOA21NM_last |
|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | 1.0 | NaN | NaN | E01033264 | Sheffield 073D |
| 2025-10-09 00:05:00 | 7.0 | 66.978 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:10:00 | 5.0 | 31.989 | 1.0 | E01033270 | Sheffield 022H |
| 2025-10-09 01:00:00 | 18.0 | 172.938 | 1.0 | E01008063 | Sheffield 011B |
| 2025-10-09 01:45:00 | 2.0 | 14.995 | 1.0 | E01008138 | Sheffield 006D |
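apply_agg_over_interval presumably resamples onto fixed windows, aggregates, and flattens the resulting MultiIndex column names. Note that pandas' "last" aggregation returns the last non-null value in each window, which is what gives the "most recent valid LSOA" behaviour described above. A sketch under those assumptions:

```python
def aggregate_over_interval(df, agg_dict, time_period="5min"):
    # "5min" matches the window spacing shown above; DEFAULT_TIME_PERIOD may differ.
    agg = df.resample(time_period).agg(agg_dict)
    agg.columns = ["_".join(col) for col in agg.columns]  # ("distance", "sum") -> "distance_sum"
    return agg[agg["distance_size"] > 0]  # keep only windows that contain samples
```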
Rename columns for clarity
```python
lsoa_agg_renamed = df_utils.rename_columns(
    df=lsoa_agg,
    col_names=[
        "LSOA_n_samples",
        "dist_travelled",
        "mean_speed",
        "latest_LSOA21CD",
        "latest_LSOA21NM",
    ],
)
lsoa_agg_renamed.head()
```
| datetime | LSOA_n_samples | dist_travelled | mean_speed | latest_LSOA21CD | latest_LSOA21NM |
|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | 1.0 | NaN | NaN | E01033264 | Sheffield 073D |
| 2025-10-09 00:05:00 | 7.0 | 66.978 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:10:00 | 5.0 | 31.989 | 1.0 | E01033270 | Sheffield 022H |
| 2025-10-09 01:00:00 | 18.0 | 172.938 | 1.0 | E01008063 | Sheffield 011B |
| 2025-10-09 01:45:00 | 2.0 | 14.995 | 1.0 | E01008138 | Sheffield 006D |
Save data
```python
data_io.save_dataframe_ouput(lsoa_agg_renamed, FILEOUT_PATH)
```
Read the data back in to check its integrity
```python
test_df = data_io.load_data_from_file(FILEOUT_PATH)
test_df.head()
```
| datetime | LSOA_n_samples | dist_travelled | mean_speed | latest_LSOA21CD | latest_LSOA21NM |
|---|---|---|---|---|---|
| 2025-10-09 00:00:00 | 1.0 | NaN | NaN | E01033264 | Sheffield 073D |
| 2025-10-09 00:05:00 | 7.0 | 66.978 | 1.0 | E01033264 | Sheffield 073D |
| 2025-10-09 00:10:00 | 5.0 | 31.989 | 1.0 | E01033270 | Sheffield 022H |
| 2025-10-09 01:00:00 | 18.0 | 172.938 | 1.0 | E01008063 | Sheffield 011B |
| 2025-10-09 01:45:00 | 2.0 | 14.995 | 1.0 | E01008138 | Sheffield 006D |
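The save and load helpers above presumably wrap pandas' parquet I/O; a minimal equivalent round trip (requires a parquet engine such as pyarrow) looks like:

```python
import pandas as pd

lsoa_agg_renamed.to_parquet(FILEOUT_PATH)     # the datetime index and dtypes are preserved
roundtrip_df = pd.read_parquet(FILEOUT_PATH)
print(roundtrip_df.equals(lsoa_agg_renamed))  # quick integrity check
```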