GPS parsing example

GPS parsing example

import logging
from pathlib import Path

from bhf_smarthealth.parse import gps_parser as bhf_gps
from bhf_smarthealth.utils import data_io
from bhf_smarthealth.utils import df_utils
from bhf_smarthealth.config import DEFAULT_TIME_PERIOD

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Save to repo to use in testing
FILEOUT_PATH = Path("../../../tests/data/processed_synth_gps_2025-20-09.parquet")
LSOA_path = Path("../../../tests/data/lsoas_for_testing.pkl")

data_source = Path("../../../tests/data/synth_gps_2025-10-09.json")

Load raw JSON file

gps_json = bhf_gps.load_gps_json(data_source)
INFO:bhf_smarthealth.parse.gps_parser:Loaded 828 GPS data points from ../../../tests/data/synth_gps_2025-10-09.json

Validate raw data using Pydantic

gps_val = bhf_gps.validate_gps_data(gps_json)

Convert JSON dict to GeoDataFrame

gps_gdf = bhf_gps.gps_to_dataframe(gps_val)

Convert raw GPS latitude and longitude to geometry

gps_gdf = bhf_gps.add_gps_as_geometry(gps_gdf)
gps_gdf.head()
long lat geometry
datetime
2025-10-09 00:00:00 -1.470228 53.380663 POINT (-1.47023 53.38066)
2025-10-09 00:00:15 -1.470220 53.380797 POINT (-1.47022 53.3808)
2025-10-09 00:00:27 -1.470207 53.380905 POINT (-1.47021 53.3809)
2025-10-09 00:00:36 -1.470205 53.380986 POINT (-1.4702 53.38099)
2025-10-09 00:00:43 -1.470207 53.381049 POINT (-1.47021 53.38105)

Calculate distances

gps_gdf["distance"] = bhf_gps.calculate_distances(gps_gdf)
gps_gdf.head()
long lat geometry distance
datetime
2025-10-09 00:00:00 -1.470228 53.380663 POINT (-1.47023 53.38066) NaN
2025-10-09 00:00:15 -1.470220 53.380797 POINT (-1.47022 53.3808) 14.995
2025-10-09 00:00:27 -1.470207 53.380905 POINT (-1.47021 53.3809) 11.996
2025-10-09 00:00:36 -1.470205 53.380986 POINT (-1.4702 53.38099) 8.997
2025-10-09 00:00:43 -1.470207 53.381049 POINT (-1.47021 53.38105) 6.998

Calculate speeds

gps_gdf["speed"] = bhf_gps.calculate_speeds(gps_gdf)
gps_gdf.head()
long lat geometry distance speed
datetime
2025-10-09 00:00:00 -1.470228 53.380663 POINT (-1.47023 53.38066) NaN NaN
2025-10-09 00:00:15 -1.470220 53.380797 POINT (-1.47022 53.3808) 14.995 1.0
2025-10-09 00:00:27 -1.470207 53.380905 POINT (-1.47021 53.3809) 11.996 1.0
2025-10-09 00:00:36 -1.470205 53.380986 POINT (-1.4702 53.38099) 8.997 1.0
2025-10-09 00:00:43 -1.470207 53.381049 POINT (-1.47021 53.38105) 6.998 1.0

GPS data cleaning

  • Remove rows where the speed between points is unreasonably large (> 130 km/h, i.e. roughly 35 m/s)

  • Where there is a large time jump, set distance and speed to NULL so that they are not included in calculations

gps_gdf_clean = df_utils.apply_func_list_to_df(
    df=gps_gdf,
    func_list=[
        bhf_gps.filter_speeds,
        bhf_gps.filter_time_jumps,
    ],
)

gps_gdf_clean.head()
long lat geometry distance speed
datetime
2025-10-09 00:00:00 -1.470228 53.380663 POINT (-1.47023 53.38066) NaN NaN
2025-10-09 00:00:15 -1.470220 53.380797 POINT (-1.47022 53.3808) 14.995 1.0
2025-10-09 00:00:27 -1.470207 53.380905 POINT (-1.47021 53.3809) 11.996 1.0
2025-10-09 00:00:36 -1.470205 53.380986 POINT (-1.4702 53.38099) 8.997 1.0
2025-10-09 00:00:43 -1.470207 53.381049 POINT (-1.47021 53.38105) 6.998 1.0

Load LSOAs and join

lsoa_data = data_io.load_data_from_file(LSOA_path)

gps_lsoa_gdf = bhf_gps.join_spatial_data(gps_gdf_clean, lsoa_data)

gps_lsoa_gdf.head()
long lat geometry distance speed index_right LSOA21CD LSOA21NM
datetime
2025-10-09 00:00:00 -1.470228 53.380663 POINT (-1.47023 53.38066) NaN NaN 28.0 E01033264 Sheffield 073D
2025-10-09 00:00:15 -1.470220 53.380797 POINT (-1.47022 53.3808) 14.995 1.0 28.0 E01033264 Sheffield 073D
2025-10-09 00:00:27 -1.470207 53.380905 POINT (-1.47021 53.3809) 11.996 1.0 28.0 E01033264 Sheffield 073D
2025-10-09 00:00:36 -1.470205 53.380986 POINT (-1.4702 53.38099) 8.997 1.0 28.0 E01033264 Sheffield 073D
2025-10-09 00:00:43 -1.470207 53.381049 POINT (-1.47021 53.38105) 6.998 1.0 28.0 E01033264 Sheffield 073D

Drop unneeded columns

columns = ["lat", "long", "geometry", "index_right"]
lsoa_gdf = df_utils.remove_columns(gps_lsoa_gdf, col_names=columns)
lsoa_gdf.head()
distance speed LSOA21CD LSOA21NM
datetime
2025-10-09 00:00:00 NaN NaN E01033264 Sheffield 073D
2025-10-09 00:00:15 14.995 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:27 11.996 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:36 8.997 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:43 6.998 1.0 E01033264 Sheffield 073D

Set LSOA values to NULL at bad timepoints, i.e. where the distance is NULL. This is done separately from the time jump filter so that the LSOA value can be kept if needed down the line, for example if the modal LSOA needs to be calculated from the raw data rather than from the aggregated interval data.

lsoa_gdf_clean = bhf_gps.nullify_lsoa_at_invalid_timepoints(lsoa_gdf)
lsoa_gdf_clean.head()
distance speed LSOA21CD LSOA21NM
datetime
2025-10-09 00:00:00 NaN NaN E01033264 Sheffield 073D
2025-10-09 00:00:15 14.995 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:27 11.996 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:36 8.997 1.0 E01033264 Sheffield 073D
2025-10-09 00:00:43 6.998 1.0 E01033264 Sheffield 073D

Aggregate data into time windows. The latest LSOA in the time window is selected. If the latest LSOA value in the time window is associated with poor-quality data (e.g. after a time jump), then the most recent non-NULL LSOA is taken instead, moving backwards.

agg_dict = {
    "distance": ["size", "sum"],
    "speed": ["mean"],
    "LSOA21CD": ["last"],
    "LSOA21NM": ["last"],
}
lsoa_agg = df_utils.apply_agg_over_interval(
    lsoa_gdf_clean, agg_dict=agg_dict, time_period=DEFAULT_TIME_PERIOD
)
lsoa_agg.head()
distance_size distance_sum speed_mean LSOA21CD_last LSOA21NM_last
datetime
2025-10-09 00:00:00 1.0 NaN NaN E01033264 Sheffield 073D
2025-10-09 00:05:00 7.0 66.978 1.0 E01033264 Sheffield 073D
2025-10-09 00:10:00 5.0 31.989 1.0 E01033270 Sheffield 022H
2025-10-09 01:00:00 18.0 172.938 1.0 E01008063 Sheffield 011B
2025-10-09 01:45:00 2.0 14.995 1.0 E01008138 Sheffield 006D

Rename columns for clarity

lsoa_agg_renamed = df_utils.rename_columns(
    df=lsoa_agg,
    col_names=[
        "LSOA_n_samples",
        "dist_travelled",
        "mean_speed",
        "latest_LSOA21CD",
        "latest_LSOA21NM",
    ],
)
lsoa_agg_renamed.head()
LSOA_n_samples dist_travelled mean_speed latest_LSOA21CD latest_LSOA21NM
datetime
2025-10-09 00:00:00 1.0 NaN NaN E01033264 Sheffield 073D
2025-10-09 00:05:00 7.0 66.978 1.0 E01033264 Sheffield 073D
2025-10-09 00:10:00 5.0 31.989 1.0 E01033270 Sheffield 022H
2025-10-09 01:00:00 18.0 172.938 1.0 E01008063 Sheffield 011B
2025-10-09 01:45:00 2.0 14.995 1.0 E01008138 Sheffield 006D

Save data

data_io.save_dataframe_ouput(lsoa_agg_renamed, FILEOUT_PATH)

Read the saved data back in to check its integrity

test_df = data_io.load_data_from_file(FILEOUT_PATH)
test_df.head()
LSOA_n_samples dist_travelled mean_speed latest_LSOA21CD latest_LSOA21NM
datetime
2025-10-09 00:00:00 1.0 NaN NaN E01033264 Sheffield 073D
2025-10-09 00:05:00 7.0 66.978 1.0 E01033264 Sheffield 073D
2025-10-09 00:10:00 5.0 31.989 1.0 E01033270 Sheffield 022H
2025-10-09 01:00:00 18.0 172.938 1.0 E01008063 Sheffield 011B
2025-10-09 01:45:00 2.0 14.995 1.0 E01008138 Sheffield 006D