# GPS parsing example

In [None]:
import logging
from pathlib import Path

from bhf_smarthealth.parse import gps_parser as bhf_gps
from bhf_smarthealth.utils import data_io
from bhf_smarthealth.utils import df_utils
from bhf_smarthealth.config import DEFAULT_TIME_PERIOD

# Module logger for this notebook; root logging is configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# All paths below are relative, so they resolve against the notebook's working directory.
# The processed output is saved into the repo's test data folder so the tests can use it.
# NOTE(review): the output filename is dated "2025-20-09" while the input file below is
# dated "2025-10-09" — this looks like a transposed date. Confirm which name the tests
# expect before renaming.
FILEOUT_PATH = Path("../../../tests/data/processed_synth_gps_2025-20-09.parquet")
# LSOA data that is joined to the GPS rows further down.
LSOA_path = Path("../../../tests/data/lsoas_for_testing.pkl")

# Raw GPS JSON that is loaded in the first processing step.
data_source = Path("../../../tests/data/synth_gps_2025-10-09.json")

Load raw JSON file

In [None]:
# Read the raw GPS JSON at `data_source`; the result is validated in the next step.
gps_json = bhf_gps.load_gps_json(data_source)


Validate raw data using Pydantic

In [None]:
# Validate the loaded JSON; `gps_val` is what gets converted to a dataframe next.
gps_val = bhf_gps.validate_gps_data(gps_json)

Convert JSON dict to GeoDataFrame

In [None]:
# Build the dataframe from the validated data (the geometry is added in the following step).
gps_gdf = bhf_gps.gps_to_dataframe(gps_val)

Convert raw GPS latitude and longitude to geometry

In [None]:
# Rebinds `gps_gdf` to the helper's result, so every later cell works with the
# version that carries the geometry.
gps_gdf = bhf_gps.add_gps_as_geometry(gps_gdf)
gps_gdf.head()

Calculate distances

In [None]:
# Stores the helper's result as a new "distance" column on `gps_gdf` (the frame is
# modified in place rather than copied).
gps_gdf["distance"] = bhf_gps.calculate_distances(gps_gdf)
gps_gdf.head()

Calculate speeds

In [None]:
# Stores the helper's result as a new "speed" column on `gps_gdf`. By this point the
# frame already has the "distance" column added in the previous step.
# NOTE(review): presumably the speed is derived from that distance column — confirm.
gps_gdf["speed"] = bhf_gps.calculate_speeds(gps_gdf)
gps_gdf.head()

GPS data cleaning

- Remove rows where the speed between points is unreasonably large (> 130 km/h, approx. 35 m/s)
- Where there is a large time jump, set distance and speed to NULL so that they are not included in calculations

In [None]:
# Cleaning steps handed to the pipeline helper: first the speed filter, then the
# time-jump filter (presumably applied in list order — confirm in df_utils).
cleaning_steps = [
    bhf_gps.filter_speeds,
    bhf_gps.filter_time_jumps,
]

# The cleaned result gets its own name; `gps_gdf` is not rebound here.
gps_gdf_clean = df_utils.apply_func_list_to_df(df=gps_gdf, func_list=cleaning_steps)

gps_gdf_clean.head()

Load LSOAs and join them to the GPS data

In [None]:
# `LSOA_path` points at a .pkl file.
# NOTE(review): if the loader unpickles it, only ever point this at trusted files —
# unpickling can execute arbitrary code.
lsoa_data = data_io.load_data_from_file(LSOA_path)

# Join the cleaned GPS rows with the LSOA data.
gps_lsoa_gdf = bhf_gps.join_spatial_data(gps_gdf_clean, lsoa_data)

gps_lsoa_gdf.head()

Drop unneeded columns

In [None]:
# Columns that are no longer needed once the LSOA join is done.
# NOTE(review): "index_right" is presumably a leftover from the spatial join — confirm.
columns = ["lat", "long", "geometry", "index_right"]
lsoa_gdf = df_utils.remove_columns(gps_lsoa_gdf, col_names=columns)
lsoa_gdf.head()

Set LSOA values to NULL at bad timepoints, i.e. where the distance is NULL. This is done separately from the time jump filter so that the LSOA value can be kept if needed down the line, for example if the modal LSOA needs to be calculated from the raw data rather than the aggregated interval data.

In [None]:
# The result is bound to a new name, `lsoa_gdf_clean`, which feeds the aggregation below.
lsoa_gdf_clean = bhf_gps.nullify_lsoa_at_invalid_timepoints(lsoa_gdf)
lsoa_gdf_clean.head()

Aggregate data into time windows. The latest LSOA in the time window is selected. If the latest LSOA value in the time window is associated with poor quality data (e.g. after a time jump), then the most recent valid LSOA before it is taken instead, moving backwards through the window.

In [None]:
# Maps each column to the aggregations computed for it per time window.
# The order of the keys and of each list matters: the following cell renames the
# aggregated columns from an ordered list of names (presumably matched by position —
# confirm in df_utils.rename_columns).
agg_dict = {
    "distance": ["size", "sum"],  # renamed to LSOA_n_samples and dist_travelled
    "speed": ["mean"],  # renamed to mean_speed
    "LSOA21CD": ["last"],  # latest LSOA code in the window
    "LSOA21NM": ["last"],  # latest LSOA name in the window
}
# The window length comes from the project config (DEFAULT_TIME_PERIOD).
lsoa_agg = df_utils.apply_agg_over_interval(
    lsoa_gdf_clean, agg_dict=agg_dict, time_period=DEFAULT_TIME_PERIOD
)
lsoa_agg.head()


Rename columns for clarity

In [None]:
# The new names are listed in the same order as the aggregations in `agg_dict`
# (presumably applied by position — confirm in df_utils.rename_columns). If
# `agg_dict` is reordered, this list must be reordered to match.
lsoa_agg_renamed = df_utils.rename_columns(
    df=lsoa_agg,
    col_names=[
        "LSOA_n_samples",  # distance / size
        "dist_travelled",  # distance / sum
        "mean_speed",  # speed / mean
        "latest_LSOA21CD",  # LSOA21CD / last
        "latest_LSOA21NM",  # LSOA21NM / last
    ],
)
lsoa_agg_renamed.head()

Save data

In [None]:
# Write the renamed aggregate to FILEOUT_PATH (a .parquet path in the repo's test data folder).
data_io.save_dataframe_ouput(lsoa_agg_renamed, FILEOUT_PATH)

Re-read in the data to check its integrity

In [None]:
# Read back the file that was just written and compare it with the in-memory frame,
# so that a bad write fails loudly instead of relying on eyeballing head().
test_df = data_io.load_data_from_file(FILEOUT_PATH)

# The row count must survive the save/load round trip.
assert len(test_df) == len(lsoa_agg_renamed), (
    f"Row count changed on reload: saved {len(lsoa_agg_renamed)}, loaded {len(test_df)}"
)

# Every saved column must come back. Extra columns are tolerated in case the
# save/load helpers materialise the index as a regular column.
missing_cols = set(lsoa_agg_renamed.columns) - set(test_df.columns)
assert not missing_cols, f"Columns missing after reload: {sorted(missing_cols)}"

test_df.head()