
Speasy cache levels analysis

Only for Google Colab users:

[ ]:
%pip install --upgrade ipympl speasy
[ ]:
try:
    from google.colab import output
    output.enable_custom_widget_manager()
except ImportError:
    print("Not running inside Google Colab")

For all users:

[1]:
import speasy as spz

amda_tree = spz.inventories.tree.amda
%matplotlib widget
# Use this instead if you are not using jupyterlab yet
#%matplotlib notebook
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import time
import numpy as np

First, ensure that Speasy is set up to use the SciQLop remote cache:

[2]:
spz.config.proxy.url.set('http://sciqlop.lpp.polytechnique.fr/cache-dev')
spz.config.proxy.enabled.set(True)
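
To double-check that the configuration took effect (or to restore your own settings afterwards), it can be useful to read the entries back; a minimal sketch, assuming config entries provide a get() accessor symmetric to set():

[ ]:
# Read back the proxy settings written above (get() is assumed to be
# the read counterpart of set()).
print(spz.config.proxy.url.get())
print(spz.config.proxy.enabled.get())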
[3]:
start_time = datetime(2016, 6, 2)
stop_time = datetime(2016, 6, 8)
reference_data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, progress=False)
print(f"Data shape: {reference_data.values.shape}")
print(f"Data size in Bytes: {reference_data.nbytes}")
Data shape: (32400, 3)
Data size in Bytes: 1039238
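
Before benchmarking, it can help to glance at what get_data returned; a quick sketch, assuming the usual SpeasyVariable accessors (time and to_dataframe here):

[ ]:
# Inspect the reference variable: time coverage and a pandas summary.
print(reference_data.time[0], "->", reference_data.time[-1])
print(reference_data.to_dataframe().describe())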
[4]:
def times(f, *args, n=10, **kwargs):
    """Call f(*args, **kwargs) n times and return each duration in milliseconds."""
    def time_once():
        start = time.perf_counter_ns()
        f(*args, **kwargs, progress=False)
        stop = time.perf_counter_ns()
        return (stop - start) / 1e6

    return [time_once() for _ in range(n)]

def best_99_percent(samples):
    """Keep the fastest 99% of samples, discarding the slowest outliers."""
    return sorted(samples)[:int(len(samples) * .99)]

def best_90_percent(samples):
    """Keep the fastest 90% of samples, discarding the slowest outliers."""
    return sorted(samples)[:int(len(samples) * .9)]
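
A minimal usage sketch of these helpers (purely illustrative; the real measurements follow):

[ ]:
# Time a handful of cached requests and summarise the fastest 90% with numpy.
sample = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
               start_time, stop_time, n=3)
print(f"median: {np.median(best_90_percent(sample)):.1f} ms")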

Cache level comparison

Now request the same data several times under each of the three configurations (summarised in the sketch below):

  • without any cache: each time, Speasy downloads the data directly from AMDA

  • with the remote cache only: each time, Speasy downloads the data from our remote cache hosted at sciqlop.lpp.polytechnique.fr

  • with the local cache: after the first request, Speasy loads the data from your local disk

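As a compact reference, here is how each configuration maps onto the get_data keyword arguments used in the benchmark cell below:

[ ]:
# Illustrative only: this dict is not used by the cells below.
configs = {
    'without any cache': dict(disable_cache=True, disable_proxy=True),
    'remote cache only': dict(disable_cache=True),  # local cache bypassed, remote cache used
    'local cache': dict(),  # defaults: local cache and remote proxy both enabled
}
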
[5]:
durations_without_any_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                    disable_cache=True, disable_proxy=True, n=10);
durations_with_remote_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                    disable_cache=True, n=1000);
durations_with_local_cache = times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                                   n=10000);
[6]:
fig, axs = plt.subplots(3, 1, figsize=(6, 10))
for i, data, title in ((0, best_99_percent(durations_without_any_cache), 'Without any cache'),
                       (1, best_99_percent(durations_with_remote_cache), 'With only SciQLop remote cache'),
                       (2, best_99_percent(durations_with_local_cache), 'With local on disk cache')):
    axs[i].hist(data)
    axs[i].set_xlabel('Execution time (ms)')
    axs[i].set_title(title)

fig.suptitle('Execution time distributions for each configuration', fontsize=16)
plt.tight_layout()
plt.show()
[7]:
fig, ax = plt.subplots()
ax.violinplot([best_99_percent(durations_without_any_cache),
               best_99_percent(durations_with_remote_cache),
               best_99_percent(durations_with_local_cache)],
              showmeans=False, showmedians=True)
ax.set_xticks([1, 2, 3], labels=['without', 'remote', 'local'])
ax.set_ylabel('Execution time (ms)')
plt.semilogy()
fig.suptitle('Comparison plot', fontsize=16)
plt.tight_layout()
plt.show()

Scaling

On disk cache scaling

[8]:
start_time = datetime(2016, 6, 2)


def scaling_point(delta):
    """Warm the cache with delta hours of data, then time repeated cached reads."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time)
    capacity = data.nbytes
    t = best_90_percent(times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, n=200))
    return capacity, t


deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]


fig, ax = plt.subplots()
stats = [t for c, t in values]
capacities = np.array([c for c, t in values])
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities), showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('On disk cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()

SciQLop remote cache scaling

[9]:
start_time = datetime(2016, 6, 2)


def scaling_point(delta):
    """Warm the remote cache with delta hours of data, then time repeated reads with the local cache disabled."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time, disable_cache=True)
    capacity = data.nbytes
    t = best_90_percent(times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf, start_time, stop_time,
                              disable_cache=True, n=20))
    return capacity, t


deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]

fig, ax = plt.subplots()
stats = [t for c, t in values]
capacities = np.array([c for c, t in values])
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities), showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('SciQLop remote cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()