pyobsmod dataset
Import required packages.
import matplotlib.pyplot as plt
import numpy as np
import pyobsmod as pom
Create dataset
Create a dataset from your own data or from some random data.
obs = np.random.normal(size=100)
mod = np.random.normal(size=100)
ds = pom.Dataset(obs, mod)
print(ds)
pyobsmod.Dataset(
obs mod
0 0.815622 0.422712
1 0.777127 0.210631
2 0.214772 1.410394
3 0.320680 1.172358
4 0.369763 0.303181
.. ... ...
95 0.759742 -1.055641
96 0.488496 -1.514556
97 0.383779 -0.436620
98 0.836241 0.638921
99 1.128827 -0.014982
[100 rows x 2 columns]
)
Or if you want to be set up quickly, use an example dataset.
from pyobsmod import load_dataset_example
ds = load_dataset_example()
print(ds)
pyobsmod.Dataset(
obs mod
2018-01-01 00:00:00 0.049671 -0.233403
2018-01-01 03:00:00 0.034964 -0.049165
2018-01-01 06:00:00 0.234695 0.166153
2018-01-01 09:00:00 0.450255 0.289799
2018-01-01 12:00:00 0.330685 0.298428
... ... ...
2018-01-12 21:00:00 -0.017711 0.059352
2018-01-13 00:00:00 0.292558 0.115787
2018-01-13 03:00:00 0.374932 0.405677
2018-01-13 06:00:00 0.318567 0.330209
2018-01-13 09:00:00 0.119279 -0.109315
[100 rows x 2 columns]
)
Manipulate dataset
It is possible to access a single time step by index, or to slice the dataset in the usual Python fashion.
print(ds[10])
print(ds[-10:])
pyobsmod.Dataset(
obs mod
2018-01-02 06:00:00 -0.845278 -1.229032
)
pyobsmod.Dataset(
obs mod
2018-01-12 06:00:00 0.335818 0.246515
2018-01-12 09:00:00 0.308820 0.480100
2018-01-12 12:00:00 0.010321 0.053139
2018-01-12 15:00:00 -0.028031 -0.277178
2018-01-12 18:00:00 -0.015112 0.019524
2018-01-12 21:00:00 -0.017711 0.059352
2018-01-13 00:00:00 0.292558 0.115787
2018-01-13 03:00:00 0.374932 0.405677
2018-01-13 06:00:00 0.318567 0.330209
2018-01-13 09:00:00 0.119279 -0.109315
)
Or get the length of the observations.
print(len(ds))
100
You can also access the data directly. Either one by one:
print(ds.obs)
[ 0.04967142 0.0349644 0.23469547 0.45025491 0.33068495 0.256278
0.22118487 -0.17248844 -0.6127289 -0.72372215 -0.84527774 -0.64263177
-0.18250208 0.07820026 0.52743791 0.90382771 0.8720994 0.76739082
0.22692176 -0.30134305 -0.41714954 -0.8110417 -0.78316554 -0.73637821
-0.33751113 0.0456923 0.14874143 0.39070086 0.24814237 0.15479755
-0.00151169 0.18575592 0.03835859 0.05019873 0.3691898 0.23159066
0.31490648 -0.10507636 -0.34826656 -0.51687368 -0.69016229 -0.78989033
-0.63798089 -0.28246495 0.07299837 0.59010267 0.89804571 1.08975801
0.80444581 0.18898771 -0.08032357 -0.56797206 -0.84330894 -0.73859512
-0.51771387 -0.22372662 -0.07894061 0.21556774 0.38385403 0.41517279
0.15008791 0.0506914 -0.10852399 -0.08815847 0.22343297 0.41086262
0.34469929 0.40716436 0.15356898 -0.24634721 -0.47021303 -0.59428003
-0.81632288 -0.49824806 -0.55892554 0.25402871 0.63126073 0.89569136
1.00118877 0.60341999 0.39011867 -0.02898252 -0.34560539 -0.81223428
-0.88830032 -0.69657952 -0.25891936 0.00736817 0.17468833 0.39818288
0.33581848 0.30881979 0.01032071 -0.0280307 -0.01511241 -0.01771126
0.29255801 0.3749318 0.31856747 0.11927875]
Or all the data together (ds.data works as well):
print(ds.values)
[[ 0.04967142 -0.23340273]
[ 0.0349644 -0.04916466]
[ 0.23469547 0.16615257]
[ 0.45025491 0.28979946]
[ 0.33068495 0.29842781]
[ 0.256278 0.33708817]
[ 0.22118487 0.59842205]
[-0.17248844 -0.13757287]
[-0.6127289 -0.56121883]
[-0.72372215 -0.73861133]
[-0.84527774 -1.22903198]
[-0.64263177 -0.64793454]
[-0.18250208 -0.17045603]
[ 0.07820026 0.57084868]
[ 0.52743791 0.48896571]
[ 0.90382771 0.96413718]
[ 0.8720994 0.86515705]
[ 0.76739082 0.53365521]
[ 0.22692176 0.45548633]
[-0.30134305 -0.15095644]
[-0.41714954 -0.25894315]
[-0.8110417 -0.99291919]
[-0.78316554 -0.50260668]
[-0.73637821 -1.01674842]
[-0.33751113 -0.22013971]
[ 0.0456923 0.48378342]
[ 0.14874143 -0.04936583]
[ 0.39070086 0.27744131]
[ 0.24814237 0.26807264]
[ 0.15479755 0.05410242]
[-0.00151169 -0.31164438]
[ 0.18575592 0.19946851]
[ 0.03835859 -0.17410215]
[ 0.05019873 0.14491722]
[ 0.3691898 0.18530495]
[ 0.23159066 0.54157754]
[ 0.31490648 0.15825582]
[-0.10507636 -0.16948867]
[-0.34826656 -0.18556312]
[-0.51687368 -0.76304655]
[-0.69016229 -0.6446703 ]
[-0.78989033 -0.52846178]
[-0.63798089 -0.95947754]
[-0.28246495 -0.24553818]
[ 0.07299837 0.12497493]
[ 0.59010267 0.74646725]
[ 0.89804571 0.65065557]
[ 1.08975801 0.82566669]
[ 0.80444581 0.90883413]
[ 0.18898771 0.24838465]
[-0.08032357 -0.030225 ]
[-0.56797206 -0.49868242]
[-0.84330894 -0.97931388]
[-0.73859512 -0.69214438]
[-0.51771387 -0.45909937]
[-0.22372662 -0.3665969 ]
[-0.07894061 0.29421429]
[ 0.21556774 0.31033432]
[ 0.38385403 0.14559333]
[ 0.41517279 0.54648351]
[ 0.15008791 -0.04484842]
[ 0.0506914 0.20810832]
[-0.10852399 0.12319512]
[-0.08815847 -0.25229494]
[ 0.22343297 0.4161082 ]
[ 0.41086262 0.49341881]
[ 0.34469929 0.50911132]
[ 0.40716436 0.78652296]
[ 0.15356898 0.10449136]
[-0.24634721 -0.39709444]
[-0.47021303 -0.64811591]
[-0.59428003 -0.75744208]
[-0.81632288 -0.83174322]
[-0.49824806 -0.43001767]
[-0.55892554 -0.50358738]
[ 0.25402871 0.41946536]
[ 0.63126073 0.63386111]
[ 0.89569136 1.18639817]
[ 1.00118877 0.9482574 ]
[ 0.60341999 1.14745382]
[ 0.39011867 0.51525214]
[-0.02898252 -0.20041403]
[-0.34560539 -0.55978389]
[-0.81223428 -0.7157398 ]
[-0.88830032 -0.93299287]
[-0.69657952 -0.55377942]
[-0.25891936 -0.16427184]
[ 0.00736817 -0.00719762]
[ 0.17468833 0.00532959]
[ 0.39818288 0.09521344]
[ 0.33581848 0.24651549]
[ 0.30881979 0.48009955]
[ 0.01032071 0.05313946]
[-0.0280307 -0.27717846]
[-0.01511241 0.01952377]
[-0.01771126 0.05935221]
[ 0.29255801 0.11578652]
[ 0.3749318 0.40567683]
[ 0.31856747 0.33020921]
[ 0.11927875 -0.10931531]]
You can add, subtract, multiply or divide a dataset using one or multiple values, or do the same with another dataset.
For example, to add a constant bias you might do something like
print(ds + 2)
pyobsmod.Dataset(
obs mod
2018-01-01 00:00:00 2.049671 1.766597
2018-01-01 03:00:00 2.034964 1.950835
2018-01-01 06:00:00 2.234695 2.166153
2018-01-01 09:00:00 2.450255 2.289799
2018-01-01 12:00:00 2.330685 2.298428
... ... ...
2018-01-12 21:00:00 1.982289 2.059352
2018-01-13 00:00:00 2.292558 2.115787
2018-01-13 03:00:00 2.374932 2.405677
2018-01-13 06:00:00 2.318567 2.330209
2018-01-13 09:00:00 2.119279 1.890685
[100 rows x 2 columns]
)
Or to add two different biases
print(ds + [2, 3])
pyobsmod.Dataset(
obs mod
2018-01-01 00:00:00 2.049671 2.766597
2018-01-01 03:00:00 2.034964 2.950835
2018-01-01 06:00:00 2.234695 3.166153
2018-01-01 09:00:00 2.450255 3.289799
2018-01-01 12:00:00 2.330685 3.298428
... ... ...
2018-01-12 21:00:00 1.982289 3.059352
2018-01-13 00:00:00 2.292558 3.115787
2018-01-13 03:00:00 2.374932 3.405677
2018-01-13 06:00:00 2.318567 3.330209
2018-01-13 09:00:00 2.119279 2.890685
[100 rows x 2 columns]
)
Adding two datasets is as easy as
print(ds + ds)
pyobsmod.Dataset(
obs mod
2018-01-01 00:00:00 0.099343 -0.466805
2018-01-01 03:00:00 0.069929 -0.098329
2018-01-01 06:00:00 0.469391 0.332305
2018-01-01 09:00:00 0.900510 0.579599
2018-01-01 12:00:00 0.661370 0.596856
... ... ...
2018-01-12 21:00:00 -0.035423 0.118704
2018-01-13 00:00:00 0.585116 0.231573
2018-01-13 03:00:00 0.749864 0.811354
2018-01-13 06:00:00 0.637135 0.660418
2018-01-13 09:00:00 0.238557 -0.218631
[100 rows x 2 columns]
)
Internally, all the operations are deferred to the underlying DataFrame. This means that an operation with two datasets will be performed element-wise. For even more control you can also access the DataFrame directly.
print(ds.df)
obs mod
2018-01-01 00:00:00 0.049671 -0.233403
2018-01-01 03:00:00 0.034964 -0.049165
2018-01-01 06:00:00 0.234695 0.166153
2018-01-01 09:00:00 0.450255 0.289799
2018-01-01 12:00:00 0.330685 0.298428
... ... ...
2018-01-12 21:00:00 -0.017711 0.059352
2018-01-13 00:00:00 0.292558 0.115787
2018-01-13 03:00:00 0.374932 0.405677
2018-01-13 06:00:00 0.318567 0.330209
2018-01-13 09:00:00 0.119279 -0.109315
[100 rows x 2 columns]
You can also perform actions on the DataFrame itself, for example, taking the mean of each column.
print(ds.df.mean())
obs 0.001861
mod 0.006322
dtype: float64
Or adding a new column.
ds.df['new_column'] = np.random.random(100)
print(ds)
pyobsmod.Dataset(
obs mod new_column
2018-01-01 00:00:00 0.049671 -0.233403 0.877373
2018-01-01 03:00:00 0.034964 -0.049165 0.740769
2018-01-01 06:00:00 0.234695 0.166153 0.697016
2018-01-01 09:00:00 0.450255 0.289799 0.702484
2018-01-01 12:00:00 0.330685 0.298428 0.359491
... ... ... ...
2018-01-12 21:00:00 -0.017711 0.059352 0.575474
2018-01-13 00:00:00 0.292558 0.115787 0.388170
2018-01-13 03:00:00 0.374932 0.405677 0.643288
2018-01-13 06:00:00 0.318567 0.330209 0.458253
2018-01-13 09:00:00 0.119279 -0.109315 0.545617
[100 rows x 3 columns]
)
Statistic methods
Try out some of the statistic methods.
print(ds.compute_stats(['bias', 'rmse']))
bias 0.004461
rmse 0.189830
dtype: float64
It is also possible to pass arguments for the individual statistics. In that case you have to pass a dictionary with the statistic as a key and the arguments as another dictionary. For statistics without arguments, you can simply pass an empty dictionary.
print(ds.compute_stats({'rmse': {}, 'r': {'method': 'spearman'}}))
rmse 0.189830
r 0.927009
dtype: float64
Get a quick description of your dataset.
print(ds.describe_dataset())
bias 0.004461
rmse 0.189830
nrmse 0.095968
r2 0.844887
dtype: float64
Get a table plot of the statistic methods.
ds.stats_plot(["bias", "rmse"])
plt.show()
Plotting methods
Or plot your data with matplotlib or seaborn.
grid = ds.scatter_plot_sns(['bias', 'rmse', 'nrmse', 'r2'])
fig = grid.fig
ax = grid.ax_joint
plt.show()
It is also possible to use the plots only as a subplot. Create a figure and at least one axes instance and simply pass the axis to the method!
fig, axs = plt.subplots(3, 1, figsize=(20, 10))
# First subplot with pyobsmod
ds.time_series_plot(ax=axs[0])
# Second subplot with pyobsmod
ds.scatter_plot(['bias', 'rmse', 'nrmse', 'r2'], ax=axs[1])
# Third subplot whatever you want to plot
axs[2].plot(obs, obs)
plt.show()
Note, however, that this does NOT work with the Dataset.scatter_plot_sns
function due to the way seaborn handles this particular plot.
Save/Load dataset
Once you are done working with your data, you can save it.
ds.save("my_dataset")
The next time you can load your data simply with:
ds = pom.load_dataset("my_dataset")
Working without a dataset
Sometimes creating an instance of a dataset is a bit of a hassle, especially
if you only want to compute some statistics quickly. In that case you can
use the standalone functions of the pyobsmod.metrics module. Here are
some examples:
import pyobsmod.metrics as pym
obs, mod = ds.obs, ds.mod
print(pym.rmse(obs, mod))
print(pym.r(obs, mod, method="spearman"))
print(pym.compute_stats(obs, mod, ["bias", "rmse"]))
print(pym.describe_dataset(obs, mod))
0.18983015004931222
0.927008700870087
bias 0.004461
rmse 0.189830
dtype: float64
bias 0.004461
rmse 0.189830
nrmse 0.095968
r2 0.844887
dtype: float64