Table Of Contents
Table Of Contents

[Download]

Synthetic Data Generation Tutorial

In [1]:
import json
from itertools import islice
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
In [2]:
from gluonts.dataset.artificial import recipe as rcp
from gluonts.core.serde import dump_code, load_code
In [3]:
# plotting utils

def plot_recipe(recipe, length, major_period=288, minor_divisions=12):
    """Evaluate ``recipe`` once and plot each of its outputs in its own row.

    Parameters
    ----------
    recipe
        List of ``(name, expression)`` pairs, as accepted by ``rcp.evaluate``.
    length
        Length passed to ``rcp.evaluate``.
    major_period
        Spacing of the major x ticks. Defaults to 288, the seasonality
        period used by the recipes in this notebook.
    minor_divisions
        Number of minor intervals between two major ticks (default 12).
    """
    output_dict = rcp.evaluate(recipe, length)
    K = len(output_dict)

    # squeeze=False keeps `axs` a 2-D array even when the recipe has a single
    # output; otherwise plt.subplots returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(K, 1, figsize=(16, 2 * K), squeeze=False)
    for ax, (k, values) in zip(axs[:, 0], output_dict.items()):
        # A Locator instance must not be shared between axes, so build a
        # fresh pair for every row.
        ax.xaxis.set_major_locator(MultipleLocator(major_period))
        ax.xaxis.set_minor_locator(AutoMinorLocator(minor_divisions))
        ax.plot(values)
        ax.grid()
        ax.set_ylabel(k)


def plot_examples(recipe, target, length, num, anomaly_indicator=None):
    """Plot ``num`` independent evaluations of ``recipe``, one per row.

    Parameters
    ----------
    recipe
        List of ``(name, expression)`` pairs, as accepted by ``rcp.evaluate``.
    target
        Name of the recipe output to draw.
    length
        Length passed to ``rcp.evaluate``.
    num
        Number of evaluations (rows) to draw.
    anomaly_indicator
        Optional name of a recipe output that is multiplied by the plot
        height and drawn as a red shaded area behind the series.
    """
    # squeeze=False keeps `axs` a 2-D array so that num == 1 also works.
    fig, axs = plt.subplots(num, 1, figsize=(16, num * 2), squeeze=False)
    for ax in axs[:, 0]:
        xx = rcp.evaluate(recipe, length)
        series = xx[target]
        # Leave 10% headroom above the largest value; the same height is
        # reused for the anomaly shading below.
        upper = 1.1 * np.max(series)
        ax.plot(series)
        ax.set_ylim(0, upper)
        ax.grid()
        if anomaly_indicator is not None:
            ax.fill_between(
                np.arange(len(series)),
                xx[anomaly_indicator] * upper,
                np.zeros(len(series)),
                alpha=0.3,
                color="red",
            )


def print_dicts(*dicts):
    """Print each given dictionary as a brace-delimited block.

    Every entry goes on its own tab-indented ``key :  value`` line, and a
    blank line follows each closing brace.
    """
    for mapping in dicts:
        rows = ["{"]
        rows.extend(f"\t {key!s} :  {value!s}" for key, value in mapping.items())
        rows.append("}\n")
        print("\n".join(rows))

Recipes

Recipes are lists of (name, expression) tuples. The role of a recipe is to describe the generative process of a single time series. In order to do so, the expressions in the (name, expression) pairs are evaluated for each time series in the order given in the list to produce a {name: value} dictionary as output.

In [4]:
# Minimal recipe: two names bound to plain Python values.
recipe = [
    ("myOutput1", 1.0),
    ("myOutput2", 42)
]

# Evaluating the recipe returns a {name: value} dictionary.
rcp.evaluate(recipe, length=5)
Out[4]:
{'myOutput1': 1.0, 'myOutput2': 42}

Expressions

Each expression can either be a Python value, a string (interpreted as a reference to a previously defined name), or a special type of Callable that is evaluated each time the recipe is evaluated.

In [5]:
recipe = [
    ("myOutput1", 1.0),
    ("myOutput2", "myOutput1")  # reference to a previously defined name
]

# "myOutput2" resolves to the value already computed for "myOutput1".
rcp.evaluate(recipe, length=5)
Out[5]:
{'myOutput1': 1.0, 'myOutput2': 1.0}
In [6]:
recipe = [
    ("myOutput1", rcp.RandomGaussian()),  # callable as expression
]

# multiple evaluations lead to different results, due to randomness
# (no seed is set, so the printed numbers change from run to run)
print_dicts(
    rcp.evaluate(recipe, length=5),
    rcp.evaluate(recipe, length=5),
)
{
         myOutput1 :  [-1.69674452 -0.42197157  1.38592954 -1.13811894 -1.06239691]
}

{
         myOutput1 :  [-0.4850381  -0.65920443  0.68951118  1.36561071 -0.71162324]
}

Expressions with References

In [7]:
recipe = [
    ("stddev1", 2.0),  # constant standard deviation
    ("stddev2", rcp.RandomUniform(low=0, high=1, shape=(1, ))),  # single value, redrawn on every evaluation
    ("myOutput1", rcp.RandomGaussian(stddev="stddev1")),  # operator argument given as a reference by name
    ("myOutput2", rcp.RandomGaussian(stddev="stddev2"))
]

# multiple evaluations lead to different results, due to randomness
print_dicts(
    rcp.evaluate(recipe, length=5),
    rcp.evaluate(recipe, length=5)
)
{
         stddev1 :  2.0
         stddev2 :  [0.38008483]
         myOutput1 :  [-0.17208844  3.87596545 -0.3283231  -2.51976359  3.72707279]
         myOutput2 :  [-0.48293124  0.20628998  0.04645834 -0.59160444 -0.11661102]
}

{
         stddev1 :  2.0
         stddev2 :  [0.33249927]
         myOutput1 :  [-0.45888149 -2.41575306 -2.32953018  0.61609625 -1.48939901]
         myOutput2 :  [-0.33175128  0.28107296  0.32919377  0.23818991 -0.15756851]
}

In [8]:
recipe = [
    ("random_out", rcp.RandomGaussian(shape=(1,))),  # recipe operator: re-sampled on each evaluation
    ("fixed_out", np.random.randn(1))  # plain NumPy call: executed right here, result stored as a constant
]

# note that fixed_out stays the same;
# it's evaluated only once when the recipe is created
print_dicts(
    rcp.evaluate(recipe, length=1),
    rcp.evaluate(recipe, length=1)
)
{
         random_out :  [0.63621815]
         fixed_out :  [0.09259595]
}

{
         random_out :  [0.98416592]
         fixed_out :  [0.09259595]
}

Length

Most operators in the recipe package have a length argument that is automatically passed when the expression is evaluated. The idea is that these recipes are used to generate fixed-length time series, and most operators produce individual components of the time series that have the same length.

In [9]:
# Neither operator is given a shape; the output size follows the `length`
# passed to evaluate().
recipe = [
    ("random_gaussian", rcp.RandomGaussian()),
    ("constant_vec", rcp.ConstantVec(42))
]

# Same recipe, two different lengths.
print_dicts(
    rcp.evaluate(recipe, length=3),
    rcp.evaluate(recipe, length=5)
)
{
         random_gaussian :  [ 0.80892307 -0.48501767 -0.09222491]
         constant_vec :  [42. 42. 42.]
}

{
         random_gaussian :  [-0.91181503 -0.76814061 -1.14260113 -0.64974641  0.57390245]
         constant_vec :  [42. 42. 42. 42. 42.]
}

Operator Overloading

The Callable operators defined in the recipe package overload the basic arithmetic operations (addition, subtraction, multiplication, division).

In [10]:
recipe = [
    ("x1", 42 * rcp.ConstantVec(1)),  # number combined with an operator
    ("x2", "x1" * rcp.RandomUniform()),  # the string operand is a reference to "x1"
    ("x3", rcp.RandomGaussian() + rcp.RandomUniform()),  # operator combined with an operator
    # rcp.Ref is required for the left-most operand: "x1" + "x2" on its own
    # would be ordinary Python string concatenation.
    ("x4", rcp.Ref("x1") + "x2" + "x3")
]

rcp.evaluate(recipe, 3)
Out[10]:
{'x1': array([42., 42., 42.]),
 'x2': array([40.85152138, 31.15185218, 41.71600366]),
 'x3': array([ 1.74629618, -0.36399104,  0.23023716]),
 'x4': array([84.59781756, 72.78786114, 83.94624082])}

SerDe

Recipes composed of serializable / representable components can easily be serialized / deserialized.

In [11]:
# Serialize the recipe from the previous cell to a code string and show it.
dumped = dump_code(recipe)
print(dumped)

# Rebuild the recipe from that string.
reconstructed = load_code(dumped)

# The rebuilt recipe evaluates like the original; the random components are
# re-sampled, so the numbers differ from the earlier output.
rcp.evaluate(reconstructed, 3)
[["x1", gluonts.dataset.artificial.recipe.LiftedMul(left=42, right=gluonts.dataset.artificial.recipe.ConstantVec(constant=1))], ["x2", gluonts.dataset.artificial.recipe.LiftedMul(left="x1", right=gluonts.dataset.artificial.recipe.RandomUniform(high=1.0, low=0.0, shape=[0]))], ["x3", gluonts.dataset.artificial.recipe.LiftedAdd(left=gluonts.dataset.artificial.recipe.RandomGaussian(shape=[0], stddev=1.0), right=gluonts.dataset.artificial.recipe.RandomUniform(high=1.0, low=0.0, shape=[0]))], ["x4", gluonts.dataset.artificial.recipe.LiftedAdd(left=gluonts.dataset.artificial.recipe.LiftedAdd(left=gluonts.dataset.artificial.recipe.Ref(field_name="x1"), right="x2"), right="x3")]]
Out[11]:
{'x1': array([42., 42., 42.]),
 'x2': array([18.90442409, 23.75295266, 14.56077472]),
 'x3': array([ 0.10218595, -1.04778157,  3.05428979]),
 'x4': array([61.00661004, 64.70517109, 59.61506451])}

Simple Examples

In [12]:
# Smooth seasonality with period 288 plus Gaussian noise.
recipe = [
    ("daily_smooth_seasonality", rcp.SmoothSeasonality(period=288, phase=-72)),
    ("noise", rcp.RandomGaussian(stddev=0.1)),
    ("signal", rcp.Add(["daily_smooth_seasonality", "noise"]))
]

# 3 * 288 time steps, i.e. three full seasonal periods.
plot_recipe(recipe, 3 * 288)

../../_images/examples_synthetic_data_generation_tutorial_tutorial_19_0.png
In [13]:
# Same signal as before with an added linear trend.
recipe = [
    ("slope", rcp.RandomUniform(low=0, high=3, shape=(1,))),  # single value per evaluation
    ("trend", rcp.LinearTrend(slope="slope")),
    ("daily_smooth_seasonality", rcp.SmoothSeasonality(period=288, phase=-72)),
    ("noise", rcp.RandomGaussian(stddev=0.1)),
    ("signal", rcp.Add(["trend", "daily_smooth_seasonality", "noise"]))
]

# Five independent evaluations of the recipe, three periods each.
plot_examples(recipe, "signal", 3 * 288, 5)
../../_images/examples_synthetic_data_generation_tutorial_tutorial_20_0.png
In [14]:
# Base recipe reused by later cells; its final output is named "unscaled".
weekly_seasonal_unscaled = [
    ('daily_smooth_seasonality', rcp.SmoothSeasonality(period=288, phase=-72)),
    ('weekday_scale', rcp.RandomUniform(0.1, 10, shape=(1,))),
    # 7-entry pattern: five entries of weekday_scale followed by two entries
    # of 1, normalized by the maximum.
    ('weekly_pattern', rcp.NormalizeMax(rcp.Concatenate([rcp.Ref("weekday_scale") * np.ones(5), np.ones(2)]))),
    # presumably repeats the weekly pattern and holds each entry for 288
    # consecutive steps (one period) -- confirm against the recipe docs
    ('day_of_week', rcp.Dilated(rcp.Repeated('weekly_pattern'), 288)),
    ('level', rcp.RandomUniform(low=0, high=10, shape=1)),
    ('noise_level', rcp.RandomUniform(low=0.01, high=1, shape=1)),
    ('noise', 'noise_level' * rcp.RandomGaussian()),  # string operand is a reference to "noise_level"
    ('signal', rcp.Mul(['daily_smooth_seasonality','day_of_week'])),
    ('unscaled', rcp.Add(['level', 'signal', 'noise']))
]

# First every intermediate component of one evaluation, then five examples
# of the final output, over 10 periods.
plot_recipe(weekly_seasonal_unscaled, 10 * 288)
plot_examples(weekly_seasonal_unscaled, "unscaled", 10 * 288, 5)
../../_images/examples_synthetic_data_generation_tutorial_tutorial_21_0.png
../../_images/examples_synthetic_data_generation_tutorial_tutorial_21_1.png

Composing Recipes

As recipes are just lists of expressions that are evaluated sequentially, recipes can simply be composed from smaller component recipes by concatenating the corresponding lists. It is also possible to include the output of one recipe inside another one using the EvalRecipe operator.

In [15]:
# Recipe fragment that expects an "unscaled" output to be defined before it
# and produces the scaled series "z".
# NOTE(review): "scale" is created without a `shape`. As with the shape-less
# rcp.RandomUniform() in the operator-overloading example, this presumably
# draws one value per time step rather than one scale per series -- confirm
# that this is intended (a single per-series scale would need shape=1).
scaling = [
    ("scale", rcp.RandomUniform(0, 1000)),
    ("z", "scale" * rcp.Ref("unscaled"))
]

# Composition is plain list concatenation.
weekly_seasonal = weekly_seasonal_unscaled + scaling

plot_examples(weekly_seasonal, "z", 10 * 288, 5)
../../_images/examples_synthetic_data_generation_tutorial_tutorial_23_0.png
In [16]:
# Seasonality component built from two random patterns.
weekly_seasonality = [
    ('daily_pattern', rcp.RandomUniform(0, 1, shape=(24,))),  # 24 random values
    # presumably each of the 24 values is held for 12 steps (24 * 12 = 288) -- confirm
    ('daily_seasonality', rcp.Dilated(rcp.Repeated("daily_pattern"), 12)),
    ('weekly_pattern', rcp.RandomUniform(0, 1, shape=(7,))),  # 7 random values
    ('weekly_seasonality', rcp.Dilated(rcp.Repeated("weekly_pattern"), 288)),
    ('unnormalized_seasonality', rcp.Mul(['daily_seasonality', 'weekly_seasonality'])),
    ('seasonality', rcp.NormalizeMax("unnormalized_seasonality")),
]

# Noise component with a randomly drawn, small noise level.
gaussian_noise_low = [
    ('noise_level', rcp.RandomUniform(low=0.01, high=0.1, shape=1)),
    ('noise', rcp.Ref('noise_level') * rcp.RandomGaussian()),
]

# Full recipe: the fragments are concatenated in dependency order, ending
# with the `scaling` fragment defined in an earlier cell (which reads
# "unscaled" and produces "z").
complex_weekly_seasonal = (
      weekly_seasonality
    + [
        ('level', rcp.RandomUniform(low=0, high=10, shape=1)),
        ('signal', rcp.Add(['level', 'seasonality']))
    ]
    + gaussian_noise_low
    + [("unscaled", rcp.Add(["signal", "noise"]))]
    + scaling
)

plot_examples(complex_weekly_seasonal, "z", 10 * 288, 5)
../../_images/examples_synthetic_data_generation_tutorial_tutorial_24_0.png

Generating Anomalies

Anomalies are just another effect added/multiplied to a base time series. We can define a recipe for creating certain types of anomalies, and then compose it with a base recipe.

In [17]:
# Trivial base series: constant 1.0, so the anomaly effect is easy to see.
constant_recipe = [
    ("z", rcp.ConstantVec(1.0))
]

# Anomaly fragment: expects a base series named "z" and produces "target".
bmc_scale_anomalies = [
    # presumably a two-state Markov chain whose arguments are the per-step
    # transition probabilities between states 1 and 0 -- confirm
    ('normal_indicator', rcp.BinaryMarkovChain(one_to_zero=1/(288*7), zero_to_one=0.1)),
    ('anomaly_indicator', rcp.OneMinus('normal_indicator')),
    # 0.5 + U(-1, 1): one scale per evaluation, in the range [-0.5, 1.5]
    ('anomaly_scale', 0.5 + rcp.RandomUniform(-1.0, 1.0, shape=1)),
    # equals 1 wherever anomaly_indicator is 0, and 1 + anomaly_scale where it is 1
    ('anomaly_multiplier', 1 + rcp.Ref('anomaly_scale') * rcp.Ref('anomaly_indicator')),
    ('target', rcp.Mul(['z', 'anomaly_multiplier']))
]

# The last argument names the output that plot_examples shades in red.
plot_examples(constant_recipe + bmc_scale_anomalies, "target", 10*288, 5, "anomaly_indicator")
../../_images/examples_synthetic_data_generation_tutorial_tutorial_26_0.png
In [18]:
# Same anomaly fragment applied to the scaled weekly-seasonal series "z".
plot_examples(weekly_seasonal + bmc_scale_anomalies, 'target', 288*7, 5, "anomaly_indicator")
../../_images/examples_synthetic_data_generation_tutorial_tutorial_27_0.png

Generating Changepoints

In [19]:
# Constant level plus Gaussian noise; the final output is named "unscaled".
homoskedastic_gaussian_noise = [
    ('level', rcp.RandomUniform(0, 10, shape=1)),
    ('noise_level', rcp.RandomUniform(0.01, 1, shape=1)),
    # presumably the first positional argument is `stddev`, given here as a
    # reference to "noise_level" -- confirm against the RandomGaussian signature
    ('noise', rcp.RandomGaussian("noise_level")),
    ('unscaled', rcp.Add(['level', 'noise'])),
]
In [20]:
# Series that switches between two generating recipes.
changepoint_noise_to_seasonal = [
    # EvalRecipe includes the "unscaled" output of another recipe in this one.
    ('z_1', rcp.EvalRecipe(homoskedastic_gaussian_noise, "unscaled")),
    ('z_2', rcp.EvalRecipe(weekly_seasonal_unscaled, "unscaled")),
    # presumably stacks all outputs whose names start with 'z' (z_1, z_2) -- confirm
    ('z_stacked', rcp.StackPrefix('z')),
    # presumably a per-step index with one random changepoint -- confirm
    ('change', rcp.RandomChangepoints(1)),
    # presumably picks, per step, the stacked series selected by "change" -- confirm
    ('unscaled', rcp.Choose("z_stacked", "change"))
]

# Append the `scaling` fragment from an earlier cell to obtain "z".
changepoint_noise_to_seasonal_scaled = changepoint_noise_to_seasonal + scaling
In [21]:
# Ten examples combining the changepoint recipe with the anomaly fragment.
plot_examples(changepoint_noise_to_seasonal_scaled + bmc_scale_anomalies, 'target', 288*7, 10, "anomaly_indicator")
../../_images/examples_synthetic_data_generation_tutorial_tutorial_31_0.png

Generating several time series

In [22]:
# Take the first 2 items produced by generate() for series of length 10;
# each item also carries 'item_id' and 'start' keys (see the output below).
rcp.take_as_list(rcp.generate(10, weekly_seasonal_unscaled, "2018-01-01", {}), 2)
Out[22]:
[{'daily_smooth_seasonality': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'weekday_scale': array([5.53325369]),
  'weekly_pattern': array([1.        , 1.        , 1.        , 1.        , 1.        ,
         0.18072549, 0.18072549]),
  'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
  'level': array([7.15189366]),
  'noise_level': array([0.60673574]),
  'noise': array([-1.37627579,  0.80910965, -0.5113108 ,  1.19522357,  0.76819937,
         -0.30693338,  1.54426428,  0.65576722,  0.29384949,  0.35138523]),
  'signal': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'unscaled': array([5.77561787, 7.9611223 , 6.64105875, 8.34818777, 7.92199568,
         6.84793212, 8.70043552, 7.81348013, 7.45333928, 7.51288625]),
  'item_id': '0',
  'start': '2018-01-01'},
 {'daily_smooth_seasonality': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'weekday_scale': array([8.71312027]),
  'weekly_pattern': array([1.        , 1.        , 1.        , 1.        , 1.        ,
         0.11476945, 0.11476945]),
  'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
  'level': array([9.78618342]),
  'noise_level': array([0.80116698]),
  'noise': array([ 1.19700682, -0.16436603,  0.2508195 , -0.6842733 , -2.04537114,
          0.52365764,  0.69255774, -0.59459811,  1.81845245, -1.16518975]),
  'signal': array([0.        , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
         0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
  'unscaled': array([10.98319024,  9.62193638, 10.03747882,  9.10298066,  7.74271494,
         10.31281289, 10.48301873,  9.19740456, 11.612232  ,  8.63060103]),
  'item_id': '1',
  'start': '2018-01-01'}]

Saving to a file

In [23]:
def write_to_file(recipe, length, num_ts, fields, fn):
    """Generate ``num_ts`` time series from ``recipe`` and save them as JSON lines.

    Two files are written, one JSON object per line:

    * ``fn`` contains only the keys listed in ``fields``;
    * ``fn + "-all"`` contains every key of each generated item.

    Parameters
    ----------
    recipe
        Recipe passed to ``rcp.generate``.
    length
        Length of each generated series.
    num_ts
        Number of series to take from the generator.
    fields
        Keys to keep in the main output file.
    fn
        Path of the main output file.

    Raises
    ------
    KeyError
        If a name in ``fields`` is not produced by the recipe.
    """
    with open(fn, 'w') as f, open(fn+"-all", 'w') as g:
        for x in islice(rcp.generate(length, recipe, "2019-01-07 00:00"), num_ts):
            # json cannot encode NumPy arrays or NumPy scalars (e.g. np.int64);
            # tolist() converts both to plain Python lists / numbers.
            z = {
                k: v.tolist() if isinstance(v, (np.ndarray, np.generic)) else v
                for k, v in x.items()
            }
            xx = {fi: z[fi] for fi in fields}
            try:
                line = json.dumps(xx)
            except Exception:
                # Show the offending record, then re-raise with the original
                # traceback intact.
                print(xx)
                print(z)
                raise
            f.write(line)
            f.write('\n')
            g.write(json.dumps(z))
            g.write('\n')