Machine learning data#

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import circmean, mode
import matplotlib.dates as mdates

import warnings
warnings.filterwarnings('ignore')
# Overall study period; every daily/weekly resampling below is clipped to it
StartDate='1990-01-01'
EndDate= '2023-12-31'

1. Atmosphere#

1.1 Daily resampling#

# Create the 'input' folder if it doesn't exist (comment previously said
# 'output'; exist_ok also avoids the check-then-create race)
os.makedirs('input', exist_ok=True)

# Load the cleaned buoy data, indexed by timestamp
file_path = '../atmosphere/output/wind_data_cleaned.csv'
data = pd.read_csv(file_path, parse_dates=['Timestamp'], index_col='Timestamp')

# Drop NaNs to avoid invalid math operations
data = data.dropna(subset=['WDIR', 'WSPD'])

# Convert wind direction to radians
wind_direction_radians = np.radians(data['WDIR'])

# Speed-weighted sine and cosine components of the direction
sin_component = np.sin(wind_direction_radians) * data['WSPD']
cos_component = np.cos(wind_direction_radians) * data['WSPD']

# Daily sums of the weighted components
mean_sin = sin_component.resample('D').sum()
mean_cos = cos_component.resample('D').sum()

# Total daily wind speed, used to normalize the resultant vector
total_wind_speed = data['WSPD'].resample('D').sum()

# Normalize components by total wind speed
mean_sin /= total_wind_speed
mean_cos /= total_wind_speed

# atan2 of the normalized components gives the speed-weighted mean direction
mean_direction = np.degrees(np.arctan2(mean_sin, mean_cos))
mean_direction = (mean_direction + 360) % 360  # map into [0, 360)

# Compute median manually using circular statistics
def circular_median(angles):
    """Central tendency of wind directions, in degrees within [0, 360).

    NOTE(review): despite the name, this returns the direction of the
    summed unit vectors -- i.e. a circular *mean*, not a true circular
    median. Kept as-is to preserve existing outputs.

    Returns NaN for an empty input.
    """
    if len(angles) == 0:
        return np.nan
    theta = np.radians(angles)
    resultant = np.arctan2(np.sum(np.sin(theta)), np.sum(np.cos(theta)))
    return np.degrees(resultant) % 360

median_direction = data['WDIR'].resample('D').apply(circular_median)

def _daily_direction_mode(x):
    """Most frequent wind direction in a daily sample; NaN when empty."""
    # Single mode() call (the old lambda evaluated scipy mode() twice per group)
    m = np.atleast_1d(mode(x, nan_policy='omit').mode)
    return m[0] if m.size > 0 else np.nan

# Compute the daily mode of the wind direction
mode_direction = data['WDIR'].resample('D').apply(_daily_direction_mode)

# Collect daily statistics; non-directional parameters use the daily max
resampled_data = pd.DataFrame({
    'WDIR_mean': mean_direction,
    'WDIR_median': median_direction,
    'WDIR_mode': mode_direction,
    'WSPD': data['WSPD'].resample('D').max(),
    'ATMP': data['ATMP'].resample('D').max(),
    'WTMP': data['WTMP'].resample('D').max(),
})

# Rename the index column
resampled_data = resampled_data.rename_axis('time')

# Save to CSV
output_file = 'input/wind_daily.csv'
resampled_data.to_csv(output_file)

print(f"\nResampled data saved to '{output_file}'")
#display(resampled_data)
Resampled data saved to 'input/wind_daily.csv'

1.2 Weekly resampling#

# Create the 'input' folder if it doesn't exist (comment previously said
# 'output'; exist_ok also avoids the check-then-create race)
os.makedirs('input', exist_ok=True)

# Load the cleaned buoy data, indexed by timestamp
file_path = '../atmosphere/output/wind_data_cleaned.csv'
data = pd.read_csv(file_path, parse_dates=['Timestamp'], index_col='Timestamp')

# Drop NaNs to avoid invalid math operations
data = data.dropna(subset=['WDIR', 'WSPD'])

# Weekly date range covering the full study period (weeks labelled by Monday)
weekly_date_range = pd.date_range(start=StartDate, end=EndDate, freq='W-MON')

# Initialize DataFrame with this weekly index
weekly_wind = pd.DataFrame(index=weekly_date_range)
weekly_wind.index.name = 'Timestamp'

# Convert wind direction to radians
wind_direction_radians = np.radians(data['WDIR'])

# Speed-weighted sine and cosine components of the direction
sin_component = np.sin(wind_direction_radians) * data['WSPD']
cos_component = np.cos(wind_direction_radians) * data['WSPD']

# Weekly sums of the weighted components
mean_sin_weekly = sin_component.resample('W-MON').sum()
mean_cos_weekly = cos_component.resample('W-MON').sum()

# Normalize by the total weekly wind speed
total_wind_speed_weekly = data['WSPD'].resample('W-MON').sum()
mean_sin_weekly /= total_wind_speed_weekly
mean_cos_weekly /= total_wind_speed_weekly

# atan2 gives the speed-weighted mean direction (degrees), mapped to [0, 360)
mean_direction_weekly = np.degrees(np.arctan2(mean_sin_weekly, mean_cos_weekly))
mean_direction_weekly = (mean_direction_weekly + 360) % 360

# Compute median using circular statistics
def circular_median(angles):
    """Central direction of a set of angles (degrees), in [0, 360).

    NOTE(review): this is the angle of the vector sum of unit vectors --
    effectively a circular mean rather than a true circular median; the
    name and behavior are preserved for output compatibility.
    Empty input yields NaN.
    """
    if len(angles) == 0:
        return np.nan
    rad = np.radians(angles)
    sin_total = np.sum(np.sin(rad))
    cos_total = np.sum(np.cos(rad))
    return np.degrees(np.arctan2(sin_total, cos_total)) % 360

median_direction_weekly = data['WDIR'].resample('W-MON').apply(circular_median)

def _weekly_direction_mode(x):
    """Most frequent wind direction in a weekly sample; NaN when empty."""
    # Single mode() call (the old lambda evaluated scipy mode() twice per group)
    m = np.atleast_1d(mode(x, nan_policy='omit').mode)
    return m[0] if m.size > 0 else np.nan

# Compute the weekly mode of the wind direction
mode_direction_weekly = data['WDIR'].resample('W-MON').apply(_weekly_direction_mode)

# Compile resampled data (non-directional parameters use the weekly max)
weekly_wind['WDIR_mean'] = mean_direction_weekly
weekly_wind['WDIR_median'] = median_direction_weekly
weekly_wind['WDIR_mode'] = mode_direction_weekly
weekly_wind['WSPD'] = data['WSPD'].resample('W-MON').max()
weekly_wind['ATMP'] = data['ATMP'].resample('W-MON').max()
weekly_wind['WTMP'] = data['WTMP'].resample('W-MON').max()

# Rename the index column
weekly_wind = weekly_wind.rename_axis('time')



# Three stacked panels: wind-direction statistics, speed, temperatures
fig, axs = plt.subplots(3, 1, figsize=(10, 15))

# Panel 1: overlay the three weekly direction statistics
for column, colour in [('WDIR_mean', 'b'), ('WDIR_median', 'g'), ('WDIR_mode', 'r')]:
    axs[0].plot(weekly_wind.index, weekly_wind[column], label=column, color=colour)
axs[0].set_title('Wind Direction Measures')
axs[0].set_ylabel('Degrees')
axs[0].legend(loc='best')

# Panel 2: weekly maximum wind speed
axs[1].plot(weekly_wind.index, weekly_wind['WSPD'], color='c')
axs[1].set_title('Wind Speed')
axs[1].set_ylabel('Speed (m/s)')

# Panel 3: air and water temperature
axs[2].plot(weekly_wind.index, weekly_wind['ATMP'], label='ATMP', color='m')
axs[2].plot(weekly_wind.index, weekly_wind['WTMP'], label='WTMP', color='y')
axs[2].set_title('Air and Water Temperature')
axs[2].set_ylabel('Temperature (°C)')
axs[2].legend(loc='best')

plt.tight_layout()
plt.show()

# Persist the weekly table
output_file_weekly = 'input/wind_weekly.csv'
weekly_wind.to_csv(output_file_weekly)

print(f"\nResampled weekly data saved to '{output_file_weekly}'")
display(weekly_wind)
../_images/3e2b92e1e5b6da67b4bd8f48a1aceaca2524d2d386d50ef67551fdf2c032f978.png
Resampled weekly data saved to 'input/wind_weekly.csv'
WDIR_mean WDIR_median WDIR_mode WSPD ATMP WTMP
time
1990-01-01 9.895843 328.282768 15.0 12.2 25.3 26.1
1990-01-08 133.922631 137.632056 113.0 12.9 26.1 26.1
1990-01-15 44.485475 35.621339 95.0 11.3 23.4 26.1
1990-01-22 113.017005 113.548079 105.0 11.0 26.1 26.6
1990-01-29 98.798579 109.931826 117.0 16.4 25.6 26.1
... ... ... ... ... ... ...
2023-11-27 NaN NaN NaN NaN NaN NaN
2023-12-04 NaN NaN NaN NaN NaN NaN
2023-12-11 NaN NaN NaN NaN NaN NaN
2023-12-18 NaN NaN NaN NaN NaN NaN
2023-12-25 NaN NaN NaN NaN NaN NaN

1774 rows × 6 columns

2. Ocean#

2.1 HAB#

2.1.1 Daily resampling max#

import pandas as pd  # NOTE(review): redundant re-import; pandas is imported at the top

# Read the raw HAB samples
file_path = "../ocean/output/hab_charlotte_harbor_all.csv"
df = pd.read_csv(file_path)

# Parse sample dates and index by them
df['SAMPLE_DATE'] = pd.to_datetime(df['SAMPLE_DATE'])
df.set_index('SAMPLE_DATE', inplace=True)

# Number of samples taken on each date (aligned back onto every row)
df['samples'] = df.groupby(df.index).size()

# Boolean mask of "high" cell counts (> 100,000 cells)
high_count_criteria = df['CELLCOUNT'] > 1e5

# Count high-CELLCOUNT samples per date
num_high_samples = (df[high_count_criteria].groupby(df[high_count_criteria].index).size())

# Align the counts onto every row; dates with no high samples get 0
df['samples_1e5'] = num_high_samples.reindex(df.index, fill_value=0).astype(int)

# Save the annotated per-sample table
output_file = 'input/hab_daily_all.csv'
df.to_csv(output_file)

# Sort so the highest CELLCOUNT of each day comes first
df_sorted = df.sort_values(['SAMPLE_DATE', 'CELLCOUNT'], ascending=[True, False])

# Keep only the top row per calendar day (the daily maximum).
# groupby(...).head(1) replaces the slower per-group apply(lambda x: x.head(1))
df_daily_max = df_sorted.groupby(df_sorted.index.date).head(1)

# Normalize the index to datetimes named 'time'
df_daily_max.index = pd.to_datetime(df_daily_max.index)
df_daily_max.index.name = 'time'

# Limit the DataFrame to the specified date range
df_daily_max = df_daily_max.loc[StartDate:EndDate]

# Save to CSV
output_file = 'input/hab_daily.csv'
df_daily_max.to_csv(output_file)

# Message fixed: this cell produces the *daily* file, not a weekly one
print(f"\nResampled daily data saved to '{output_file}'")
#display(df_daily_max)
Resampled weekly data saved to 'input/hab_daily.csv'

2.1.2 Weekly resampling#

# Load the HAB samples
file_path = "../ocean/output/hab_charlotte_harbor_all.csv"
df = pd.read_csv(file_path)

# Parse the sample dates and use them as the index
df['SAMPLE_DATE'] = pd.to_datetime(df['SAMPLE_DATE'])
df = df.set_index('SAMPLE_DATE')

# Weekly maxima of the three measured quantities (weeks labelled by Monday)
weekly_hab = df.resample('W-MON').agg({
    'CELLCOUNT': 'max',
    'SALINITY': 'max',
    'WATER_TEMP': 'max'
})

# Expand to the full study period; weeks without samples become NaN rows.
# Reindexing + rename_axis replaces the former reset/rename/set-index dance.
complete_date_range = pd.date_range(start=StartDate, end=EndDate, freq='W-MON')
weekly_hab = weekly_hab.reindex(complete_date_range).rename_axis('time')


# Three stacked panels: cell count, salinity, water temperature
fig, axs = plt.subplots(3, 1, figsize=(10, 15))

panels = [
    ('CELLCOUNT', 'b', 'Cell Count', 'Cells per Unit Liter'),
    ('SALINITY', 'g', 'Salinity', 'Salinity (PSU)'),
    ('WATER_TEMP', 'r', 'Water Temperature', 'Temperature (°C)'),
]
for ax, (column, colour, title, ylabel) in zip(axs, panels):
    ax.plot(weekly_hab.index, weekly_hab[column], color=colour)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

# Persist the weekly table
output_file = 'input/hab_weekly.csv'
weekly_hab.to_csv(output_file)

print(f"\nResampled weekly data saved to '{output_file}'")
display(weekly_hab)
../_images/d01d9e7797ad14398b7cf60af2e7f7919620d1b7b497b9a912aec74f6d8c16c9.png
Resampled weekly data saved to 'input/hab_weekly.csv'
CELLCOUNT SALINITY WATER_TEMP
time
1990-01-01 NaN NaN NaN
1990-01-08 NaN NaN NaN
1990-01-15 NaN NaN NaN
1990-01-22 NaN NaN NaN
1990-01-29 NaN NaN NaN
... ... ... ...
2023-11-27 NaN NaN NaN
2023-12-04 NaN NaN NaN
2023-12-11 NaN NaN NaN
2023-12-18 NaN NaN NaN
2023-12-25 NaN NaN NaN

1774 rows × 3 columns

2.2 zos#

2.2.1 Weekly resampling#

# Read the daily ZOS (sea surface height) series, indexed by 'time'
file_path = "../ocean/output/zos_daily.csv"
df = pd.read_csv(file_path, parse_dates=['time'], index_col='time')

# Weekly aggregates (weeks labelled by Monday)
zos_weekly_mean = df['zos'].resample('W-MON').mean()                   # raw mean
detrended_zos_mean = df['detrended_zos'].resample('W-MON').mean()      # mean
detrended_zos_max = df['detrended_zos'].resample('W-MON').max()        # max
detrended_zos_median = df['detrended_zos'].resample('W-MON').median()  # median

# Complete weekly range so every week of the study period is present
weekly_range = pd.date_range(start=StartDate, end=EndDate, freq='W-MON')

# Assemble everything on the complete weekly index (gaps become NaN)
weekly_zos = pd.DataFrame({
    'zos_mean': zos_weekly_mean,
    'detrended_zos_mean': detrended_zos_mean,
    'detrended_zos_max': detrended_zos_max,
    'detrended_zos_median': detrended_zos_median
}, index=weekly_range)

# Name the index and clip to the study period
weekly_zos = weekly_zos.rename_axis('time').loc[StartDate:EndDate]


# Two stacked panels: raw weekly mean, then the detrended statistics
fig, axs = plt.subplots(2, 1, figsize=(12, 10))

# Panel 1: weekly mean sea surface height
axs[0].plot(weekly_zos.index, weekly_zos['zos_mean'], color='b', label='ZOS Mean')
axs[0].set_title('ZOS Mean Over Time')
axs[0].set_ylabel('ZOS Mean (m)')
axs[0].legend(loc='best')

# Panel 2: detrended mean / max / median overlaid
detrended_series = [
    ('detrended_zos_mean', 'g', 'Detrended ZOS Mean'),
    ('detrended_zos_max', 'r', 'Detrended ZOS Max'),
    ('detrended_zos_median', 'm', 'Detrended ZOS Median'),
]
for column, colour, label in detrended_series:
    axs[1].plot(weekly_zos.index, weekly_zos[column], color=colour, label=label)
axs[1].set_title('Detrended ZOS Measures Over Time')
axs[1].set_ylabel('Detrended ZOS (m)')
axs[1].legend(loc='best')

plt.tight_layout()
plt.show()

# Persist the weekly table
output_file = "input/zos_weekly.csv"
weekly_zos.to_csv(output_file)

print(f"Extended resampled data saved to '{output_file}'")
display(weekly_zos)
../_images/899e81d4447efd8b7986fe04989f35fe86bcdd470de799fabbd680c12030da41.png
Extended resampled data saved to 'input/zos_weekly.csv'
zos_mean detrended_zos_mean detrended_zos_max detrended_zos_median
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
1990-01-15 NaN NaN NaN NaN
1990-01-22 NaN NaN NaN NaN
1990-01-29 NaN NaN NaN NaN
... ... ... ... ...
2023-11-27 0.138694 -0.005114 0.003364 -0.005671
2023-12-04 0.138997 -0.003981 0.004709 -0.004344
2023-12-11 0.133151 -0.009013 -0.005218 -0.007787
2023-12-18 0.146926 0.005557 0.025714 0.008546
2023-12-25 0.141837 0.001243 0.014987 0.002676

1774 rows × 4 columns

3. River#

3.1 Peace river discharge#

# Read the discharge series, parsing the datetime index
file_path = "../river/input/peace_river_discharge_arcadia.csv"
df = pd.read_csv(file_path, parse_dates=['datetime'], index_col='datetime')

# Ensure the DataFrame is sorted by the datetime index
df.sort_index(inplace=True)

# Weekly mean and max of discharge (USGS parameter code '00060')
weekly_pc_dis = df['00060'].resample('W-MON').agg(['mean', 'max'])

# Limit the DataFrame to the specified date range
weekly_pc_dis = weekly_pc_dis.loc[StartDate:EndDate]

# Rename the columns to match the desired output format
weekly_pc_dis.rename(columns={'mean': 'discharge_mean', 'max': 'discharge_max'}, inplace=True)

# Deviations of each weekly statistic from its long-term mean
weekly_pc_dis['discharge_meanDev'] = weekly_pc_dis['discharge_mean'] - weekly_pc_dis['discharge_mean'].mean()
weekly_pc_dis['discharge_maxDev'] = weekly_pc_dis['discharge_max'] - weekly_pc_dis['discharge_max'].mean()

# Rename the index to 'time' and truncate timestamps to midnight.
# normalize() replaces the former strftime('%Y-%m-%d') -> to_datetime
# round trip, which serialized every timestamp to a string and back.
weekly_pc_dis.index = weekly_pc_dis.index.normalize()
weekly_pc_dis.index.name = 'time'

# Save the resulting DataFrame to a new CSV file
output_file = "input/discharge_peace_weekly.csv"
weekly_pc_dis.to_csv(output_file)

print(f"Resampled data saved to '{output_file}'")
display(weekly_pc_dis)
Resampled data saved to 'input/discharge_peace_weekly.csv'
discharge_mean discharge_max discharge_meanDev discharge_maxDev
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
1990-01-15 NaN NaN NaN NaN
1990-01-22 NaN NaN NaN NaN
1990-01-29 NaN NaN NaN NaN
... ... ... ... ...
2023-11-27 273.932127 326.0 -749.387421 -1041.917759
2023-12-04 210.825472 231.0 -812.494076 -1136.917759
2023-12-11 183.182371 201.0 -840.137177 -1166.917759
2023-12-18 204.811263 460.0 -818.508284 -907.917759
2023-12-25 344.843609 470.0 -678.475939 -897.917759

1774 rows × 4 columns

Plot discharge#

# Shared y-range so the two panels are directly comparable
common_y_range = (-1200, 5.3e4)
fig, axs = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

# Weekly mean discharge
axs[0].plot(weekly_pc_dis.index, weekly_pc_dis['discharge_mean'], color='b', label='Discharge Mean')
axs[0].set_title('Discharge Mean Over Time')
axs[0].set_ylabel('Discharge (cubic feet per second)')
axs[0].set_ylim(common_y_range)
axs[0].legend(loc='best')

# Weekly max discharge
axs[1].plot(weekly_pc_dis.index, weekly_pc_dis['discharge_max'], color='r', label='Discharge Max')
axs[1].set_title('Discharge Max Over Time')
axs[1].set_ylabel('Discharge Deviations (cubic feet per second)')
axs[1].set_ylim(common_y_range)
axs[1].legend(loc='best')

# One tick every 4 years, labelled with the year, rotated for legibility
locator = mdates.YearLocator(4)
formatter = mdates.DateFormatter('%Y')
for ax in axs:
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(formatter)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
../_images/7a04f63107c617daae7126a721978f783044dd37e9ad86c45961bfe4f62d506e.png

3.2 Peace river TN and TP#

basinID='03100101'
file_path = f'../river/input/HU8_{basinID}_TN_TP.csv'  # Adjust path if needed
df = pd.read_csv(file_path)

# Separate TN and TP data. .copy() makes each an independent frame so the
# .loc writes below modify it reliably: a plain boolean-mask slice raises
# SettingWithCopyWarning and may not stick under pandas copy-on-write.
tn_data = df[df['Parameter'].isin(['TN_mgl', 'TN_ugl'])].copy()
tp_data = df[df['Parameter'].isin(['TP_mgl', 'TP_ugl'])].copy()

# Convert ug/L concentrations to mg/L so all rows share one unit
tn_data.loc[tn_data['Parameter'] == 'TN_ugl', 'Result_Value'] /= 1000
tp_data.loc[tp_data['Parameter'] == 'TP_ugl', 'Result_Value'] /= 1000

# Ensure 'SampleDate' is converted to datetime
tn_data['SampleDate'] = pd.to_datetime(tn_data['SampleDate'])
tp_data['SampleDate'] = pd.to_datetime(tp_data['SampleDate'])

# Sort by 'SampleDate', index by it, rename the index to 'time'
tn_data = tn_data.sort_values('SampleDate').set_index('SampleDate').rename_axis('time')
tp_data = tp_data.sort_values('SampleDate').set_index('SampleDate').rename_axis('time')

# Flag samples by latitude: 0 = at or below 27.4°N, 1 = above
tn_data['Lat_TN'] = np.where(tn_data['Actual_Latitude'] <= 27.4, 0, 1)
tp_data['Lat_TP'] = np.where(tp_data['Actual_Latitude'] <= 27.4, 0, 1)

# Keep only the measurement and the latitude flag from each frame
tn_results = tn_data[['Result_Value','Lat_TN']].rename(columns={'Result_Value': 'TN'})
tp_results = tp_data[['Result_Value','Lat_TP']].rename(columns={'Result_Value': 'TP'})

# Outer-join on the time index so all sample dates are kept
data = tn_results.join(tp_results, how='outer') 

# Map latitude flags to plot colors; NaN (no sample) becomes black
data['Lat_TN_filled'] = data['Lat_TN'].fillna(-1).map({0: 'blue', 1: 'red', -1: 'black'})
data['Lat_TP_filled'] = data['Lat_TP'].fillna(-1).map({0: 'blue', 1: 'red', -1: 'black'})

# Two stacked scatter panels: TN then TP, coloured by the latitude flag
fig, axs = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

# TN panel
axs[0].scatter(data.index, data['TN'], s=10, c=data['Lat_TN_filled'])
axs[0].set_title('TN Over Time')
axs[0].set_ylabel('Value')
axs[0].xaxis.set_major_locator(mdates.YearLocator(4))  # one tick per 4 years
axs[0].xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
axs[0].legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Lat_TN=0', markerfacecolor='blue', markersize=5),
    plt.Line2D([0], [0], marker='o', color='w', label='Lat_TN=1', markerfacecolor='red', markersize=5),
], loc='upper right')

# TP panel
axs[1].scatter(data.index, data['TP'], s=10, c=data['Lat_TP_filled'])
axs[1].set_title('TP Over Time')
axs[1].set_ylabel('Value')
axs[1].set_xlabel('Year')
axs[1].legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Lat_TP=0', markerfacecolor='blue', markersize=5),
    plt.Line2D([0], [0], marker='o', color='w', label='Lat_TP=1', markerfacecolor='red', markersize=5),
], loc='upper right')
axs[1].xaxis.set_major_locator(mdates.YearLocator(4))  # one tick per 4 years
axs[1].xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# Rotate date labels for readability
plt.setp(axs[1].xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()
../_images/2d20c49360049c6223e6a8ce75be1207c1f917ddaca350e9107c2f98678dd5cc.png
# Weekly mean and max of the joined TN/TP series
resampled_mean = (data.resample('W-MON')
                      .mean(numeric_only=True)
                      .rename(columns={'TN': 'TN_mean','TP': 'TP_mean'}))
resampled_max = (data.resample('W-MON')
                     .max(numeric_only=True)
                     .rename(columns={'TN': 'TN_max','TP': 'TP_max'}))

# Clip both to the study period
weekly_mean = resampled_mean.loc[StartDate:EndDate]
weekly_max = resampled_max.loc[StartDate:EndDate]

# Assemble the nutrient table: mean and max for TN and TP
weekly_pc_nut = pd.DataFrame({
    'TN_mean': weekly_mean['TN_mean'],
    'TN_max': weekly_max['TN_max'],
    'TP_mean': weekly_mean['TP_mean'],
    'TP_max': weekly_max['TP_max'],
})

display(weekly_pc_nut)
TN_mean TN_max TP_mean TP_max
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
1990-01-15 NaN NaN NaN NaN
1990-01-22 NaN NaN NaN NaN
1990-01-29 1.980000 1.980 NaN NaN
... ... ... ... ...
2023-11-27 NaN NaN NaN NaN
2023-12-04 2.180000 2.180 0.800000 0.80
2023-12-11 1.840000 1.840 0.790000 0.79
2023-12-18 1.956353 2.208 0.884941 0.97
2023-12-25 NaN NaN NaN NaN

1774 rows × 4 columns

# Two stacked scatter panels comparing weekly max (red) vs mean (blue)
fig, axs = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

for ax, max_col, mean_col, title in (
    (axs[0], 'TN_max', 'TN_mean', 'TN Over Time'),
    (axs[1], 'TP_max', 'TP_mean', 'TP Over Time'),
):
    ax.scatter(weekly_pc_nut.index, weekly_pc_nut[max_col], s=10, c='red',label='max')
    ax.scatter(weekly_pc_nut.index, weekly_pc_nut[mean_col], s=10, c='blue',label='mean')
    ax.set_title(title)
    ax.set_ylabel('Value')
    ax.xaxis.set_major_locator(mdates.YearLocator(4))  # one tick per 4 years
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

axs[1].set_xlabel('Year')

# Rotate date labels for readability
plt.setp(axs[1].xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()
../_images/196856ca696511d8153c86497cf450e167da860cbdf921190b5660a341bce5a7.png

4. Collect dataset#

4.1 Display data#

Display data to select columns to be combined and check start and end date

# Preview head/tail rows and the shape of each weekly dataset, to verify
# column names and date coverage before combining
rows_to_show = [0, 1, -2, -1]
for weekly_df in (weekly_wind, weekly_hab, weekly_zos, weekly_pc_dis, weekly_pc_nut):
    display(weekly_df.iloc[rows_to_show], weekly_df.shape)
WDIR_mean WDIR_median WDIR_mode WSPD ATMP WTMP
time
1990-01-01 9.895843 328.282768 15.0 12.2 25.3 26.1
1990-01-08 133.922631 137.632056 113.0 12.9 26.1 26.1
2023-12-18 NaN NaN NaN NaN NaN NaN
2023-12-25 NaN NaN NaN NaN NaN NaN
(1774, 6)
CELLCOUNT SALINITY WATER_TEMP
time
1990-01-01 NaN NaN NaN
1990-01-08 NaN NaN NaN
2023-12-18 NaN NaN NaN
2023-12-25 NaN NaN NaN
(1774, 3)
zos_mean detrended_zos_mean detrended_zos_max detrended_zos_median
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
2023-12-18 0.146926 0.005557 0.025714 0.008546
2023-12-25 0.141837 0.001243 0.014987 0.002676
(1774, 4)
discharge_mean discharge_max discharge_meanDev discharge_maxDev
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
2023-12-18 204.811263 460.0 -818.508284 -907.917759
2023-12-25 344.843609 470.0 -678.475939 -897.917759
(1774, 4)
TN_mean TN_max TP_mean TP_max
time
1990-01-01 NaN NaN NaN NaN
1990-01-08 NaN NaN NaN NaN
2023-12-18 1.956353 2.208 0.884941 0.97
2023-12-25 NaN NaN NaN NaN
(1774, 4)

4.2 Interpolate temperature data from NOAA Buoy and KB data#

# Compare the two water-temperature sources: KB samples vs the NOAA buoy
fig, axs = plt.subplots(2, 1, figsize=(12, 8))

# Panel 1: both series overlaid
axs[0].plot(weekly_hab.index, weekly_hab['WATER_TEMP'], label='Water Temp KB Samples', color='b')
axs[0].plot(weekly_wind.index, weekly_wind['WTMP'], label='Water Temp Bouy', color='r')
axs[0].set_title('Water Temperatures Over Time')
axs[0].set_ylabel('Temperature')
axs[0].legend(loc='best')

# Panel 2: their pointwise difference
temp_difference = weekly_hab['WATER_TEMP'] - weekly_wind['WTMP']
axs[1].plot(weekly_hab.index, temp_difference, label='Difference (KB Samples- NOAA Bouy)', color='g')
axs[1].set_title('Difference Between Water Temperatures')
axs[1].set_ylabel('Temperature Difference')
axs[1].legend(loc='best')

plt.tight_layout()
plt.show()
../_images/1784273205621e7d923db7ba86e16a5f24ba976b6e80df2e76aa4b9495b224e6.png
# weekly_wind columns: WDIR_mean, WDIR_median, WDIR_mode, WSPD, ATMP, WTMP
# weekly_hab columns: CELLCOUNT, SALINITY, WATER_TEMP

# Gap-fill the buoy water temperature two ways:
# 1) straight linear interpolation
weekly_wind['water_temp_ln'] = weekly_wind['WTMP'].interpolate(method='linear')

# 2) seasonal fill: a missing week takes the mean of the same ISO week
#    across all years
iso_week = weekly_wind.index.isocalendar().week
seasonal_mean = weekly_wind.groupby(iso_week)['WTMP'].transform('mean')
weekly_wind['water_temp_sn'] = weekly_wind['WTMP'].fillna(seasonal_mean)

# Overlay both fills against the raw buoy series
fig, axs = plt.subplots(1, 1, figsize=(12, 8))
axs.plot(weekly_wind.index, weekly_wind['water_temp_ln'], label='Water Temp Linear Interpolation', color='b')
axs.plot(weekly_wind.index, weekly_wind['water_temp_sn'], label='Water Temp Seasonal Interpolation', color='k')
axs.plot(weekly_wind.index, weekly_wind['WTMP'], label='Water Temp NOAA Bouy', color='r')
axs.set_title('Water Temperatures Over Time')
axs.set_ylabel('Temperature')
axs.legend(loc='best')
<matplotlib.legend.Legend at 0x1cb48f016d0>
../_images/2dfaf88ed5474bb92ebfb65f0b3150d74e988f46fb9696191a2dc7f4cce5894f.png

5. Combine datasets#

Combine the datasets into one DataFrame based on the columns selected for each feature; different selections can produce multiple candidate datasets.

5.1 Select columns to be combined#

def combine(var_kb, var_sal, var_temp, var_wdir, var_wspd, var_zos, var_dis_pc, var_TN_pc, var_TP_pc):
    """Assemble one weekly feature table from the per-source weekly frames.

    Each parameter names the column to take from the corresponding
    module-level DataFrame:

      var_kb, var_sal       -> weekly_hab    (cell count, salinity)
      var_temp, var_wdir,
      var_wspd              -> weekly_wind   (water temp, direction, speed)
      var_zos               -> weekly_zos    (sea surface height)
      var_dis_pc            -> weekly_pc_dis (Peace River discharge)
      var_TN_pc, var_TP_pc  -> weekly_pc_nut (Peace River TN / TP)

    Returns a DataFrame indexed by 'time' with standardized column names.
    """
    df_combined = pd.DataFrame()

    # CELLCOUNT -> kb
    df_combined['kb'] = weekly_hab[var_kb]

    # 1) SALINITY -> salinity
    df_combined['salinity'] = weekly_hab[var_sal]

    # 2) WTMP -> water_temp
    df_combined['water_temp'] = weekly_wind[var_temp]

    # 3) WDIR_mean OR WDIR_median OR WDIR_mode -> wind_direction
    #    (removed a leftover unused local that hard-coded 'WDIR_mode')
    df_combined['wind_direction'] = weekly_wind[var_wdir]

    # 4) WSPD -> wind_speed
    df_combined['wind_speed'] = weekly_wind[var_wspd]

    # 5) zos_mean OR detrended_zos_{mean,max,median} -> zos (sea surface height)
    df_combined['zos'] = weekly_zos[var_zos]

    # 6) discharge_{mean,max,meanDev,maxDev} -> peace_discharge
    df_combined['peace_discharge'] = weekly_pc_dis[var_dis_pc]

    # 7) TN_mean OR TN_max -> peace_TN
    df_combined['peace_TN'] = weekly_pc_nut[var_TN_pc]

    # 8) TP_mean OR TP_max -> peace_TP
    df_combined['peace_TP'] = weekly_pc_nut[var_TP_pc]

    # Ensure the index is named `time` if it's not already
    df_combined.index.name = 'time'

    return df_combined
# Candidate source columns for each combined feature
column_selections = {
    'kb': ['CELLCOUNT'],
    'wsalinity': ['SALINITY'],
    'wtemp': ['water_temp_sn'],
    'wdir': ['WDIR_mean', 'WDIR_median', 'WDIR_mode'],
    'wspeed': ['WSPD'],
    'zos': ['zos_mean', 'detrended_zos_mean', 'detrended_zos_max', 'detrended_zos_median'],
    'dis_pc': ['discharge_mean', 'discharge_max', 'discharge_meanDev', 'discharge_maxDev'],
    'TN_pc': ['TN_mean', 'TN_max'],
    'TP_pc': ['TP_mean', 'TP_max']
}

# Index into each candidate list above (same order as the dict keys)
select = [0, 0, 0, 2, 0, 1, 1, 1, 1]

# Resolve the chosen column names and build the combined table
chosen_columns = [column_selections[key][select[i]] for i, key in enumerate(column_selections)]
df_combined = combine(*chosen_columns)

# Encode the selection in the filename: 2nd letter of each key + its index
filename_indices = ''.join(f'{key[1].upper()}{select[i]}' for i, key in enumerate(column_selections))
output_file = f"input/data_weekly_{filename_indices}.csv"

# Save the resulting DataFrame to a new CSV file
df_combined.to_csv(output_file)

print(f"Combined data saved to '{output_file}'")
display(df_combined)
Combined data saved to 'input/data_weekly_B0S0T0D2S0O1I1N1P1.csv'
kb salinity water_temp wind_direction wind_speed zos peace_discharge peace_TN peace_TP
time
1990-01-01 NaN NaN 26.100000 15.0 12.2 NaN NaN NaN NaN
1990-01-08 NaN NaN 26.100000 113.0 12.9 NaN NaN NaN NaN
1990-01-15 NaN NaN 26.100000 95.0 11.3 NaN NaN NaN NaN
1990-01-22 NaN NaN 26.600000 105.0 11.0 NaN NaN NaN NaN
1990-01-29 NaN NaN 26.100000 117.0 16.4 NaN NaN 1.980 NaN
... ... ... ... ... ... ... ... ... ...
2023-11-27 NaN NaN 27.110714 NaN NaN -0.005114 326.0 NaN NaN
2023-12-04 NaN NaN 26.992857 NaN NaN -0.003981 231.0 2.180 0.80
2023-12-11 NaN NaN 26.528571 NaN NaN -0.009013 201.0 1.840 0.79
2023-12-18 NaN NaN 26.207143 NaN NaN 0.005557 460.0 2.208 0.97
2023-12-25 NaN NaN 26.053571 NaN NaN 0.001243 470.0 NaN NaN

1774 rows × 9 columns

5.2 Interpolate data#

  • Linear interpolation: salinity, wind_direction, wind_speed, peace_discharge, peace_TN, peace_TP,

  • Fill with zero: KB

# Working copy of the columns to interpolate
df_interpol = df_combined[['kb', 'zos', 'salinity', 'water_temp', 
                             'wind_direction', 'wind_speed', 
                             'peace_discharge', 'peace_TN', 'peace_TP']].copy()

# NOTE(review): no-op retained from an earlier version -- 'water_temp_sn'
# is not a column here (combine() already standardized the name)
df_interpol = df_interpol.rename(columns={'water_temp_sn': 'water_temp'})

# Fill NaN in 'kb' with 0. Assign the result instead of chained inplace
# fillna, which is deprecated and unreliable under pandas copy-on-write.
df_interpol['kb'] = df_interpol['kb'].fillna(0)

# Linear interpolation on specified columns
df_interpol[['salinity', 'peace_discharge', 'peace_TN', 'peace_TP']]  \
            = df_interpol[['salinity', 'peace_discharge', 'peace_TN', 'peace_TP']].interpolate(method='linear')

# Seasonal fill for wind: a missing week takes the mean of the same ISO
# week across all years
df_interpol['wind_direction'] = df_interpol['wind_direction'].fillna(
    df_combined.groupby(df_combined.index.isocalendar().week)['wind_direction'].transform('mean'))

df_interpol['wind_speed'] = df_interpol['wind_speed'].fillna(
    df_combined.groupby(df_combined.index.isocalendar().week)['wind_speed'].transform('mean'))

# Clip to the period with reliable coverage
start_date='1993-01-01'
end_date='2023-12-31'
df_interpol=df_interpol.loc[start_date:end_date,:]

# Save df_interpol. Message fixed: it previously reported the *combined*
# file's name; the CSV filename itself is kept as-is (including its
# historical spelling) for downstream compatibility.
interpolated_file = 'input/data_weekly_intepolated.csv'
df_interpol.to_csv(interpolated_file)
print(f"Interpolated data saved to '{interpolated_file}'")

# Displaying the DataFrame
display(df_interpol)

# Summary statistics
display(df_interpol.describe())
Intepolated data saved to 'input/data_weekly_B0S0T0D2S0O1I1N1P1.csv'
kb zos salinity water_temp wind_direction wind_speed peace_discharge peace_TN peace_TP
time
1993-01-04 333.0 0.012906 33.043478 26.800000 36.000000 13.900000 202.0 8.2000 1.999091
1993-01-11 667.0 0.015614 33.065217 27.000000 118.000000 16.200000 423.0 10.1000 1.934545
1993-01-18 667.0 0.021702 33.086957 27.100000 108.000000 16.200000 1470.0 12.0000 1.870000
1993-01-25 0.0 0.015950 33.108696 26.800000 110.000000 12.600000 1450.0 10.0475 1.870500
1993-02-01 0.0 0.008977 33.130435 26.500000 14.000000 17.500000 1490.0 8.0950 1.871000
... ... ... ... ... ... ... ... ... ...
2023-11-27 0.0 -0.005114 36.000000 27.110714 79.800000 11.666667 326.0 2.0675 0.836500
2023-12-04 0.0 -0.003981 36.000000 26.992857 91.466667 12.306667 231.0 2.1800 0.800000
2023-12-11 0.0 -0.009013 36.000000 26.528571 83.833333 12.086667 201.0 1.8400 0.790000
2023-12-18 0.0 0.005557 36.000000 26.207143 110.433333 12.176667 460.0 2.2080 0.970000
2023-12-25 0.0 0.001243 36.000000 26.053571 92.166667 13.396667 470.0 2.2080 0.970000

1617 rows × 9 columns

kb zos salinity water_temp wind_direction wind_speed peace_discharge peace_TN peace_TP
count 1.617000e+03 1617.000000 1617.000000 1617.000000 1617.000000 1617.000000 1617.000000 1617.000000 1617.000000
mean 1.438820e+06 0.001081 35.790494 28.261448 108.228236 11.646628 1367.712245 2.760305 1.376341
std 7.904052e+06 0.045056 2.634435 2.396945 73.549966 2.934444 2471.245893 2.473869 0.775845
min 0.000000e+00 -0.101714 16.000000 21.800000 0.000000 2.100000 9.100000 0.470000 0.040000
25% 0.000000e+00 -0.036704 34.492857 26.400000 66.000000 9.700000 198.000000 1.560000 0.858500
50% 3.330000e+02 -0.002629 36.000000 27.942308 97.892857 11.500000 606.000000 1.880500 1.125000
75% 1.370830e+05 0.038149 37.000000 30.400000 124.142857 13.189655 1560.000000 2.595000 1.688000
max 1.500000e+08 0.111739 79.780000 34.500000 360.000000 28.600000 50800.000000 20.000000 7.670000

5.3 Plot final data for ML#

# Plot every model variable as a stacked time-series panel and save the figure.
# 'df_interpol' must already exist (built in the interpolation step above).

# Ensure the index is datetime so matplotlib places dates on the x-axis.
df_interpol.index = pd.to_datetime(df_interpol.index)

# One entry per subplot: column key, line colour, y-axis label.
# (Label typos fixed: 'K. bervis' -> 'K. brevis', 'surfave' -> 'surface'.)
variables = [
    {'key': 'kb', 'color': 'b', 'label': 'K. brevis (cells/l)'},
    {'key': 'zos', 'color': 'g', 'label': 'Detrended sea surface height (m)'},
    {'key': 'salinity', 'color': 'r', 'label': 'Salinity (ppt)'},
    {'key': 'water_temp', 'color': 'm', 'label': 'Water Temperature (°C)'},
    {'key': 'wind_direction', 'color': 'c', 'label': 'Wind Direction (degrees)'},
    {'key': 'wind_speed', 'color': 'y', 'label': 'Wind Speed (m/s)'},
    {'key': 'peace_discharge', 'color': 'k', 'label': 'Peace River Discharge (cfs)'},
    {'key': 'peace_TN', 'color': 'orange', 'label': 'Peace River TN (mg/l)'},
    {'key': 'peace_TP', 'color': 'purple', 'label': 'Peace River TP (mg/l)'}
]

# One row per variable, shared x-axis.
fig, axs = plt.subplots(len(variables), 1, figsize=(12, 30), sharex=True)

def plot_variable(ax, key, color, label):
    """Plot one column of df_interpol on ax with consistent styling."""
    ax.plot(df_interpol.index, df_interpol[key], color=color, label=key)
    ax.set_ylabel(label, fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(True)
    ax.tick_params(axis='both', labelsize=12)
    # Limit y-ticks for readability.
    ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=5))

for ax, var in zip(axs, variables):
    plot_variable(ax, var['key'], var['color'], var['label'])

# Common title and x-axis label.
plt.suptitle('Environmental Data Time Series', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.xticks(rotation=0, fontsize=12)

# Apply the layout BEFORE saving; the original called savefig first, so the
# written file never included the tight-layout adjustment.
plt.tight_layout(rect=[0, 0, 1, 0.96])

# Ensure the output directory exists, then save at high resolution.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/data.png', dpi=300, bbox_inches='tight')
plt.show()
../_images/5cd710cb9d16e2d7b89bd70fa28256f27f44941547f665408d05361cabca9cbc.png
import pandas as pd
import matplotlib.pyplot as plt

# Alternative layout: the same variables in a 5x2 grid, with K. brevis alone
# on the first row. 'df_interpol' must already exist.

# Ensure the index is datetime so matplotlib places dates on the x-axis.
df_interpol.index = pd.to_datetime(df_interpol.index)

# One entry per subplot: column key, line colour, y-axis label.
# (Label typo fixed: 'heigh' -> 'height'.)
variables = [
    {'key': 'kb', 'color': 'b', 'label': 'K. brevis (cells/l)'},
    {'key': 'zos', 'color': 'g', 'label': 'Detrend sea surface height (m)'},
    {'key': 'salinity', 'color': 'r', 'label': 'Salinity (ppt)'},
    {'key': 'water_temp', 'color': 'm', 'label': 'Water Temperature (°C)'},
    {'key': 'wind_direction', 'color': 'c', 'label': 'Wind Direction (degrees)'},
    {'key': 'wind_speed', 'color': 'y', 'label': 'Wind Speed (m/s)'},
    {'key': 'peace_discharge', 'color': 'k', 'label': 'Peace River Discharge (cfs)'},
    {'key': 'peace_TN', 'color': 'orange', 'label': 'Peace River TN (mg/l)'},
    {'key': 'peace_TP', 'color': 'purple', 'label': 'Peace River TP (mg/l)'}
]

# 5x2 grid with a shared x-axis; flatten to index panels linearly.
fig, axs = plt.subplots(5, 2, figsize=(12, 15), sharex=True)
axs = axs.flatten()

def plot_variable(ax, key, color, label):
    """Plot one column of df_interpol on ax with consistent styling."""
    ax.plot(df_interpol.index, df_interpol[key], color=color, label=key)
    ax.set_ylabel(label, fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True)
    ax.tick_params(axis='both', labelsize=10)
    # Limit y-ticks for readability.
    ax.yaxis.set_major_locator(plt.MaxNLocator(nbins=5))

# K. brevis occupies the first panel; its neighbour stays blank so the driver
# variables start on the second row.
plot_variable(axs[0], 'kb', 'b', 'K. brevis (cells/l)')
axs[1].axis('off')

# Driver variables fill the remaining panels, starting at axs[2].
driver_vars = variables[1:]
for i, var in enumerate(driver_vars):
    plot_variable(axs[i + 2], var['key'], var['color'], var['label'])

# Remove any panels the grid allocates beyond what we used.
for j in range(len(driver_vars) + 2, len(axs)):
    fig.delaxes(axs[j])

# Common title and x-axis label.
plt.suptitle('Environmental Data Time Series', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.xticks(rotation=0, fontsize=10)

# Apply the layout BEFORE saving; the original called savefig first, so the
# written file never included the tight-layout adjustment.
plt.tight_layout(rect=[0, 0, 1, 0.96])

# NOTE(review): this overwrites 'figures/data.png' produced by the previous
# cell — confirm whether a distinct filename (e.g. 'data_grid.png') was intended.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/data.png', dpi=300, bbox_inches='tight')
plt.show()
../_images/ea6cfb4dab37c0eac9a760904b16bf8b3f56972d431fd51ece1e88c7dbc941ed.png