
Speasy cache levels analysis

Only for Google Colab users:

[ ]:
%pip install --upgrade ipympl speasy
[ ]:
try:
    from google.colab import output
    output.enable_custom_widget_manager()
except ImportError:
    print("Not running inside Google Colab")

For all users:

[1]:
import speasy as spz

amda_tree = spz.inventories.tree.amda
%matplotlib widget
# Use this instead if you are not using jupyterlab yet
#%matplotlib notebook
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import time
import numpy as np

First, ensure that Speasy is set up to use the SciQLop remote cache:

[2]:
spz.config.proxy.url.set('http://sciqlop.lpp.polytechnique.fr/cache-dev')
spz.config.proxy.enabled.set(True)
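
To double-check that the configuration took effect (or to restore your own settings afterwards), it can be useful to read the entries back; a minimal sketch, assuming config entries provide a get() accessor symmetric to set():

[ ]:
# Read back the proxy settings written above (get() is assumed to be
# the read counterpart of set()).
print(spz.config.proxy.url.get())
print(spz.config.proxy.enabled.get())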
[3]:
start_time = datetime(2016, 6, 2)
stop_time = datetime(2016, 6, 8)
reference_data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, progress=False)
print(f"Data shape: {reference_data.values.shape}")
print(f"Data size in Bytes: {reference_data.nbytes}")
Data shape: (32400, 3)
Data size in Bytes: 1039238
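
Before benchmarking, it can help to glance at what get_data returned; a quick sketch, assuming the usual SpeasyVariable accessors (time and to_dataframe here):

[ ]:
# Inspect the reference variable: time coverage and a pandas summary.
print(reference_data.time[0], "->", reference_data.time[-1])
print(reference_data.to_dataframe().describe())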
[4]:
def times(f, *args, n=10, **kwargs):
    """Call f(*args, **kwargs) n times and return each duration in milliseconds."""
    def time_once():
        start = time.perf_counter_ns()
        f(*args, **kwargs, progress=False)
        stop = time.perf_counter_ns()
        return (stop - start) / 1e6

    return [time_once() for _ in range(n)]

def best_99_percent(samples):
    """Keep the fastest 99% of samples, discarding the slowest outliers."""
    return sorted(samples)[:int(len(samples) * .99)]

def best_90_percent(samples):
    """Keep the fastest 90% of samples, discarding the slowest outliers."""
    return sorted(samples)[:int(len(samples) * .9)]
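
A minimal usage sketch of these helpers (purely illustrative; the real measurements follow):

[ ]:
# Time a handful of cached requests and summarise the fastest 90% with numpy.
sample = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
               start_time, stop_time, n=3)
print(f"median: {np.median(best_90_percent(sample)):.1f} ms")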

Cache level comparison

Now request the same data several times under each of the three configurations (summarised in the sketch below):

  • without any cache: each time, Speasy downloads the data directly from AMDA

  • with the remote cache only: each time, Speasy downloads the data from our remote cache hosted at sciqlop.lpp.polytechnique.fr

  • with the local cache: after the first request, Speasy loads the data from your local disk

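As a compact reference, here is how each configuration maps onto the get_data keyword arguments used in the benchmark cell below:

[ ]:
# Illustrative only: this dict is not used by the cells below.
configs = {
    'without any cache': dict(disable_cache=True, disable_proxy=True),
    'remote cache only': dict(disable_cache=True),  # local cache bypassed, remote cache used
    'local cache': dict(),  # defaults: local cache and remote proxy both enabled
}
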
[5]:
durations_without_any_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                    disable_cache=True, disable_proxy=True, n=10);
durations_with_remote_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                    disable_cache=True, n=1000);
durations_with_local_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                   n=10000);
[6]:
fig, axs = plt.subplots(3, 1, figsize=(6, 10))
for i, data, title in ((0, best_99_percent(durations_without_any_cache), 'Without any cache'),
                       (1, best_99_percent(durations_with_remote_cache), 'With only SciQLop remote cache'),
                       (2, best_99_percent(durations_with_local_cache), 'With local on disk cache')):
    axs[i].hist(data)
    axs[i].set_xlabel('Execution time (ms)')
    axs[i].set_title(title)

fig.suptitle('Execution time distributions for each configuration', fontsize=16)
plt.tight_layout()
plt.show()
[7]:
fig, ax = plt.subplots()
ax.violinplot([best_99_percent(durations_without_any_cache),
               best_99_percent(durations_with_remote_cache),
               best_99_percent(durations_with_local_cache)],
              showmeans=False, showmedians=True)
ax.set_xticks([1, 2, 3], labels=['without', 'remote', 'local'])
ax.set_ylabel('Execution time (ms)')
plt.semilogy()
fig.suptitle('Comparison plot', fontsize=16)
plt.tight_layout()
plt.show()

Scaling

On disk cache scaling

[8]:
start_time = datetime(2016, 6, 2)


def scaling_point(delta):
    """Warm the cache with delta hours of data, then time repeated cached reads."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time)
    capacity = data.nbytes
    t = best_90_percent(times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, n=200))
    return capacity, t


deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]


fig, ax = plt.subplots()
stats = [t for c, t in values]
capacities = np.array([c for c, t in values])
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities), showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('On disk cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()

SciQLop remote cache scaling

[9]:
start_time = datetime(2016, 6, 2)


def scaling_point(delta):
    """Warm the remote cache with delta hours of data, then time repeated reads with the local cache disabled."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, disable_cache=True)
    capacity = data.nbytes
    t = best_90_percent(times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                              disable_cache=True, n=20))
    return capacity, t


deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]

fig, ax = plt.subplots()
stats = [t for c, t in values]
capacities = np.array([c for c, t in values])
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities), showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('SciQLop remote cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()