In [1]:
import os
import pandas as pd
import numpy as np
pickle_path = os.path.join("data", "su_smhi_daily_full.bz2")
sthlm_data = pd.read_pickle(pickle_path)
sthlm_array = np.array(sthlm_data)
In [2]:
sthlm_data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 154823 entries, 1756-01-01 to 2020-02-01
Data columns (total 1 columns):
temperature    154823 non-null float64
dtypes: float64(1)
memory usage: 2.4 MB
In [3]:
sthlm_data.describe()
Out[3]:
temperature
count 154823.000000
mean 6.075864
std 8.214404
min -27.700000
25% 0.100000
50% 5.700000
75% 13.100000
max 28.300000
In [4]:
print(sthlm_data.iloc[np.argmin(sthlm_array)])
print(sthlm_data.iloc[np.argmax(sthlm_array)])
temperature   -27.7
Name: 1760-01-07 00:00:00, dtype: float64
temperature    28.3
Name: 1975-08-07 00:00:00, dtype: float64

Plots

In [5]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

January data

In [74]:
jan_this_year = sthlm_data["2020-01"]
jan_this_year
Out[74]:
temperature
2020-01-01 3.5
2020-01-02 5.8
2020-01-03 5.4
2020-01-04 2.4
2020-01-05 0.1
2020-01-06 4.8
2020-01-07 5.5
2020-01-08 6.5
2020-01-09 3.9
2020-01-10 1.0
2020-01-11 1.2
2020-01-12 5.9
2020-01-13 2.6
2020-01-14 3.8
2020-01-15 7.8
2020-01-16 6.1
2020-01-17 4.8
2020-01-18 4.8
2020-01-19 3.1
2020-01-20 3.8
2020-01-21 6.8
2020-01-22 3.1
2020-01-23 1.5
2020-01-24 5.1
2020-01-25 2.1
2020-01-26 5.8
2020-01-27 4.8
2020-01-28 3.4
2020-01-29 3.3
2020-01-30 2.4
2020-01-31 3.0
In [75]:
jan_this_year.mean()
Out[75]:
temperature    4.003226
dtype: float64
In [6]:
month_data = sthlm_data.resample("M").mean()
month_jan_filter = (month_data.index.month == 1)
jan_data = month_data[month_jan_filter]
jan_array = jan_data.iloc[:,:].values[:,0]
In [7]:
jan_data.describe()
Out[7]:
temperature
count 265.000000
mean -3.366214
std 3.001257
min -14.283871
25% -5.048387
50% -3.275806
75% -1.459677
max 4.003226
In [8]:
jan_min_obs = jan_data.iloc[np.argmin(jan_array)]
jan_min_date = str(jan_min_obs.name).split(' ')[0].split('-')
jan_min = jan_min_obs[0]
print(jan_min_date[0]+':', jan_min)
1814: -14.283870967741942
In [9]:
jan_max_obs = jan_data.iloc[np.argmax(jan_array)]
jan_max_date = str(jan_max_obs.name).split(' ')[0].split('-')
jan_max = jan_max_obs[0]
print(jan_max_date[0]+':', jan_max)
2020: 4.003225806451612
In [12]:
plt.figure(figsize=(20,10))
plt.title("Dygnsmedeltemperatur i Januari", fontsize=18)
plt.ylabel("Temperatur (°C)", fontsize=18)
plt.xlabel("År", fontsize=18)
plt.plot(jan_data)
plt.show()
In [13]:
years = np.array(jan_data.index.year) 
plt.figure(figsize=(20,10))
sns.regplot(x=years, y=jan_array)
plt.show()
In [14]:
plt.figure(figsize=(20,10))
sns.boxplot(jan_array)
plt.show()
In [15]:
plt.figure(figsize=(20,10))
sns.distplot(jan_array)
plt.show()
In [19]:
from scipy.stats import t
from scipy.stats import sem

# Mean interval
t.interval(0.95, len(jan_array)-1, loc=np.mean(jan_array), scale=sem(jan_array))
Out[19]:
(-3.7292286908651118, -3.003199793614499)
In [39]:
from scipy.stats import ks_2samp

ks_2samp(jan_array[:100], jan_array[-100:])
Out[39]:
Ks_2sampResult(statistic=0.33, pvalue=3.211428734211389e-05)
In [67]:
for date, value in zip(jan_data.index.date, jan_data["temperature"].values):
    if value > 2:
        print(str(date)+":", value)
1796-01-31: 2.4000000000000004
1989-01-31: 2.795161290322581
2020-01-31: 4.003225806451612

Daily data

In [43]:
day_filter = (sthlm_data.index.month == 1)
jan_day_data = sthlm_data[day_filter]
jan_day_array = jan_day_data.iloc[:,:].values[:,0]
In [44]:
ks_2samp(jan_day_array[:100*31], jan_array[-100*31:])
Out[44]:
Ks_2sampResult(statistic=0.18344491783323189, pvalue=1.1593386028962982e-07)
In [62]:
years = np.array(jan_day_data.index.year) 
plt.figure(figsize=(20,10))
sns.regplot(x=years, y=jan_day_array)
plt.show()

Days above 7 °C

In [63]:
for date, value in zip(jan_day_data.index.date, jan_day_data["temperature"].values):
    if value > 6.5:
        print(str(date)+":", value)
1790-01-13: 7.7
1796-01-22: 6.9
1898-01-19: 8.3
1932-01-19: 8.4
1957-01-09: 6.7
1992-01-03: 7.6
2005-01-07: 7.0
1898-01-19: 8.8
1932-01-19: 8.9
1957-01-09: 7.4
1983-01-06: 7.1
1983-01-12: 7.1
1989-01-15: 6.6
1989-01-16: 6.6
1989-01-30: 6.8
1992-01-03: 8.3
1998-01-11: 6.8
2005-01-07: 7.7
2005-01-08: 6.8
2007-01-10: 6.7
2020-01-15: 7.8
2020-01-21: 6.8
In [50]:
 
Out[50]:
array([-8.7, -9.2, -8.6, ...,  3.3,  2.4,  3. ])
In [ ]: