Full preprocessing pipeline

When we bring together all the steps introduced in the previous section, we can quickly produce the data set for our ML approach:

# Merge station meta data to station measurement table; sorting by time and
# station gives a deterministic row order, and the index is reset to 0..n-1
station_data = (
    station_data.merge(station_meta_data, on="icao")
    .sort_values(by=["time", "icao"])
    .reset_index(drop=True)
)
# Extract DEM pixel values at station locations (DEM is static, so this has to be done only once)
dem_px_values = dem.interp(
    x=xr.DataArray(station_data.x),
    y=xr.DataArray(station_data.y),
    method="nearest",
).values
# Extract all available time slots
time_slots = station_data.time.unique()
# Collect one merged table per time slot and concatenate them once after the
# loop: DataFrame.append was removed in pandas 2.0, and growing a DataFrame
# inside a loop copies all previously collected rows on every iteration
merged_frames = []

# Iterate over all available time slots
for time_slot in tqdm(time_slots):
    # Clip station table to current time slot
    station_data_current = station_data[station_data.time == time_slot]
    # Build the path of the MSG scene belonging to the current time slot
    satellite_path = pd.to_datetime(time_slot).strftime("data/satellite/%Y/%Y%m%d_%H%M.nc")
    # Load the MSG scene; the context manager closes the file as soon as the
    # pixel values have been materialised into a DataFrame
    with xr.open_dataset(satellite_path) as satellite_data:
        # Extract MSG pixel values at station locations
        station_px_values = satellite_data.interp(
            x=xr.DataArray(station_data_current.x),
            y=xr.DataArray(station_data_current.y),
            method="nearest",
        ).to_dataframe()
    # Merge satellite and station data into one DataFrame and keep it for the final concat
    merged_frames.append(station_data_current.merge(station_px_values, on=["x", "y"]))

# Build the final DataFrame with a fresh 0..n-1 index; pd.concat raises on an
# empty list, so fall back to an empty DataFrame when there are no time slots
df = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()
100%|██████████| 191/191 [00:37<00:00,  5.15it/s]
# Attach the DEM pixel values to the final DataFrame as column "dem".
# NOTE(review): the values are matched to rows purely by position, so this
# presumably relies on df having the same row order and length as the table
# the DEM values were sampled from - confirm.
df = df.assign(dem=dem_px_values)

# Write the DataFrame to a CSV file