import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
from matplotlib import pyplot as plt
import holoviews as hv
import hvplot.pandas
# Load bokeh
"bokeh") hv.extension(
Wind Strength and Wave Height Correlation Analysis
Data Source:
1. Wind data comes from ERA5 climate data from Climate Data Store The wind data in this project comes from the ‘ERA5 monthly averaged data on single levels from 1940 to present’ dataset, specifically the 10m u-component of wind and 10m v-component of wind.
The time period will be 2022 yearly data, and the area of interest will be Lake Erie. 2. Wave data comes from the final dataframe of the wave visualization page.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
Step 1: Read the .nc file downloaded from the ERA5 website and change it to a dataframe
Need to use xarry to read netCDF file
= xr.open_dataset('./data/ERA5land.nc')
windnc = windnc.to_dataframe().reset_index() winddf
The raw data looks like this
winddf.head()
longitude | latitude | time | u10 | v10 | |
---|---|---|---|---|---|
0 | -80.0 | 43.400002 | 2022-01-01 | 1.490095 | -0.002929 |
1 | -80.0 | 43.400002 | 2022-02-01 | 1.628315 | 0.594098 |
2 | -80.0 | 43.400002 | 2022-03-01 | 1.543347 | -0.034178 |
3 | -80.0 | 43.400002 | 2022-04-01 | 1.104076 | 0.152090 |
4 | -80.0 | 43.400002 | 2022-05-01 | -0.122249 | 0.055608 |
winddf.dtypes
longitude float64
latitude float64
time datetime64[ns]
u10 float32
v10 float32
dtype: object
Step 2: Calculate wind strength and change to geodataframe
The wind strength is based on calculation of wind u and wind v
'wind_strength'] = np.sqrt(winddf['u10']**2 + winddf['v10']**2) winddf[
Currently the data is arranged by days, but we need month value. Manipulate the dataframe to get the month column
"month_int"] = winddf["time"].dt.month
winddf[= winddf.groupby(["month_int", "longitude", "latitude"])[["wind_strength"]].mean().reset_index() wind_month
Use the latitude and longitude column to get a geodataframe of wind data
"geometry"] = gpd.points_from_xy(
wind_month["longitude"], wind_month["latitude"]
wind_month[
)= gpd.GeoDataFrame(
wind_month_gdf ="geometry", crs="EPSG:4326"
wind_month, geometry )
The geodataframe looks like this
wind_month_gdf.head()
month_int | longitude | latitude | wind_strength | geometry | |
---|---|---|---|---|---|
0 | 1 | -80.0 | 42.000000 | 1.075989 | POINT (-80.00000 42.00000) |
1 | 1 | -80.0 | 42.099998 | 1.489239 | POINT (-80.00000 42.10000) |
2 | 1 | -80.0 | 42.200001 | 1.999220 | POINT (-80.00000 42.20000) |
3 | 1 | -80.0 | 42.299999 | 2.441840 | POINT (-80.00000 42.30000) |
4 | 1 | -80.0 | 42.400002 | 2.715811 | POINT (-80.00000 42.40000) |
Have a look at the location of each wind data’s point
="wind_strength",
wind_month_gdf.explore(column="CartoDB positron") tiles
Step 3: Read the wave dataframe that is cleaned in the wave data visualization page
= pd.read_csv("./data/wave2022.csv") wave
wave.head()
time | latitude | longitude | waveHs | datetime | month_int | month | day_int | station | |
---|---|---|---|---|---|---|---|---|---|
0 | 2022-01-01 00:00:00 | 42.32 | -79.88 | 0.132812 | 2022-01-01 00:00:00 | 1 | Jan | 1 | ST92023 |
1 | 2022-01-01 01:00:00 | 42.32 | -79.88 | 0.140625 | 2022-01-01 01:00:00 | 1 | Jan | 1 | ST92023 |
2 | 2022-01-01 02:00:00 | 42.32 | -79.88 | 0.140625 | 2022-01-01 02:00:00 | 1 | Jan | 1 | ST92023 |
3 | 2022-01-01 03:00:00 | 42.32 | -79.88 | 0.140625 | 2022-01-01 03:00:00 | 1 | Jan | 1 | ST92023 |
4 | 2022-01-01 04:00:00 | 42.32 | -79.88 | 0.132812 | 2022-01-01 04:00:00 | 1 | Jan | 1 | ST92023 |
This data is arranged by hours, need to group into months as well
= wave.groupby(["month_int", "station", "longitude", "latitude"])[["waveHs"]].mean().reset_index() wave_month
Change to a geodataframe
"geometry"] = gpd.points_from_xy(
wave_month["longitude"], wave_month["latitude"]
wave_month[
)= gpd.GeoDataFrame(
wave_month_gdf ="geometry", crs="EPSG:4326"
wave_month, geometry )
The final wave height geodataframe looks like this
wave_month_gdf.head()
month_int | station | longitude | latitude | waveHs | geometry | |
---|---|---|---|---|---|---|
0 | 1 | ST92001 | -79.04 | 42.76 | 1.102995 | POINT (-79.04000 42.76000) |
1 | 1 | ST92002 | -79.08 | 42.76 | 1.166224 | POINT (-79.08000 42.76000) |
2 | 1 | ST92003 | -79.12 | 42.72 | 1.220169 | POINT (-79.12000 42.72000) |
3 | 1 | ST92004 | -79.16 | 42.68 | 1.260833 | POINT (-79.16000 42.68000) |
4 | 1 | ST92005 | -79.20 | 42.64 | 1.242513 | POINT (-79.20000 42.64000) |
Step 4: Match wave and wind data
Firstly, the wind data need to form a “container” that can be used to capture wave data near each point. Here each wind point will form a 8km buffer
Note: Need to change to crs 3857 to make the buffer
= wind_month_gdf.copy().to_crs(epsg=3857)
buffered_wind_month_gdf "geometry"] = buffered_wind_month_gdf.buffer(8e3) buffered_wind_month_gdf[
Have a look at the buffer location. It covers all the area
="wind_strength",
buffered_wind_month_gdf.explore(column="CartoDB positron") tiles