This page was generated by
nbsphinx from
docs/examples/Caches.ipynb.
Interactive online version:
Speasy caches levels analysis
Only for Google Colab users:
[ ]:
# Colab only: install the interactive matplotlib backend (ipympl) and speasy.
# %pip (rather than !pip) targets the active kernel's environment.
%pip install --upgrade ipympl speasy
[ ]:
try:
    # Colab only: allow third-party ipywidgets (ipympl) to render.
    from google.colab import output
    output.enable_custom_widget_manager()
except ImportError:
    # Catch only the missing-module case; the original bare `except:`
    # would also have swallowed KeyboardInterrupt/SystemExit.
    print("Not running inside Google Colab")
For all users:
[ ]:
# stdlib
import time
from datetime import datetime
from datetime import timedelta

# third-party
import matplotlib.pyplot as plt
import numpy as np
import speasy as spz

%matplotlib widget
# Use this instead if you are not using jupyterlab yet
#%matplotlib notebook

amda_tree = spz.inventories.tree.amda
[1]:
First, ensure that speasy is set up to use the SciQLop cache
[2]:
# Point speasy at the SciQLop community cache and enable the proxy.
sciqlop_cache_url = 'http://sciqlop.lpp.polytechnique.fr/cache-dev'
spz.config.proxy.url.set(sciqlop_cache_url)
spz.config.proxy.enabled.set(True)
[3]:
# Reference request: six days of ACE IMF data, used to size the benchmark.
# `start_time`/`stop_time` are reused by the timing cells below.
start_time = datetime(2016, 6, 2)
stop_time = datetime(2016, 6, 8)
reference_data = spz.get_data(
    amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
    start_time,
    stop_time,
    progress=False,
)
print(f"Data shape: {reference_data.values.shape}")
print(f"Data size in Bytes: {reference_data.nbytes}")
Data shape: (32400, 3)
Data size in Bytes: 1039238
[4]:
def times(f, *args, n=10, **kwargs):
    """Measure the wall-clock duration of ``n`` calls to ``f``.

    Parameters
    ----------
    f : callable
        Function to benchmark; called as ``f(*args, **kwargs)``.
    *args
        Positional arguments forwarded to ``f``.
    n : int
        Number of repetitions (default 10).
    **kwargs
        Keyword arguments forwarded to ``f``.  ``progress=False`` is
        injected unless the caller already supplied a value, so progress
        bars do not pollute the timings.

    Returns
    -------
    list of float
        One duration per call, in milliseconds.
    """
    # The original passed progress=False unconditionally, which raised
    # TypeError whenever the caller also supplied `progress`; setdefault
    # keeps the old default while allowing an explicit override.
    kwargs.setdefault('progress', False)

    def time_once():
        start = time.perf_counter_ns()
        f(*args, **kwargs)
        stop = time.perf_counter_ns()
        return (stop - start) / 1e6  # ns -> ms

    return [time_once() for _ in range(n)]
def best_99_percent(times):
    """Return the fastest 99% of the samples, dropping the slowest tail."""
    keep = int(len(times) * .99)
    return sorted(times)[:keep]
def best_90_percent(times):
    """Return the fastest 90% of the samples, dropping the slowest tail."""
    ranked = sorted(times)
    return ranked[:int(len(ranked) * .9)]
Cache level comparison
Then request data several times with all 3 configurations:
without any cache, each time speasy will download data from AMDA
with remote cache only, each time speasy downloads data from our remote cache hosted here
with local cache, each time after the first request speasy will load data from your disk
[5]:
# Benchmark the same request under the three cache configurations.
product = amda_tree.Parameters.ACE.MFI.ace_imf_all.imf

# No cache at all: every call downloads from AMDA.
durations_without_any_cache = times(
    spz.get_data, product, start_time, stop_time,
    disable_cache=True, disable_proxy=True, n=10)

# Remote cache only: every call hits the SciQLop proxy.
durations_with_remote_cache = times(
    spz.get_data, product, start_time, stop_time,
    disable_cache=True, n=1000)

# Local on-disk cache: after the first call, data comes from disk.
durations_with_local_cache = times(
    spz.get_data, product, start_time, stop_time, n=10000)
[6]:
# One histogram per configuration, stacked vertically.
fig, axs = plt.subplots(3, 1, figsize=(6, 10))
panels = (
    (best_99_percent(durations_without_any_cache), 'Without any cache'),
    (best_99_percent(durations_with_remote_cache), 'With only SciQLop remote cache'),
    (best_99_percent(durations_with_local_cache), 'With local on disk cache'),
)
for ax, (data, title) in zip(axs, panels):
    ax.hist(data)
    ax.set_xlabel('Execution time (ms)')
    ax.set_title(title)
fig.suptitle('Execution time distributions for each conf', fontsize=16)
plt.tight_layout()
plt.show()
[7]:
# Side-by-side violin plot; log scale, since the three configurations
# differ by orders of magnitude.
samples = [
    best_99_percent(durations_without_any_cache),
    best_99_percent(durations_with_remote_cache),
    best_99_percent(durations_with_local_cache),
]
fig, ax = plt.subplots()
ax.violinplot(samples, showmeans=False, showmedians=True)
ax.set_xticks([1, 2, 3], labels=['without', 'remote', 'local'])
ax.set_ylabel('Execution time (ms)')
plt.semilogy()
fig.suptitle('Comparison plot', fontsize=16)
plt.tight_layout()
plt.show()
Scaling
On disk cache scaling
[8]:
# Measure how local (on-disk) cache latency grows with request size.
start_time = datetime(2016, 6, 2)


def scaling_point(delta):
    """Return (payload size in bytes, durations in ms) for a request
    spanning `delta` hours from `start_time`, served by the local cache."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
                        start_time, stop_time)
    durations = best_90_percent(
        times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
              start_time, stop_time, n=200))
    return data.nbytes, durations


# Log-spaced request sizes from 100 h to 5000 h.
deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]

stats = [durations for _, durations in values]
capacities = np.array([size for size, _ in values])

fig, ax = plt.subplots()
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities),
              showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('On disk cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()
SciQLop remote cache scaling
[9]:
# Same scaling measurement, but against the remote SciQLop cache only
# (local on-disk cache disabled for every request).
start_time = datetime(2016, 6, 2)


# NOTE(review): redefines `scaling_point` from the previous cell; the earlier
# definition is no longer used at this point, so the shadowing is harmless.
def scaling_point(delta):
    """Return (payload size in bytes, durations in ms) for a `delta`-hour
    request served by the remote cache (local cache disabled)."""
    stop_time = start_time + timedelta(hours=delta)
    data = spz.get_data(amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
                        start_time, stop_time, disable_cache=True)
    durations = best_90_percent(
        times(spz.get_data, amda_tree.Parameters.ACE.MFI.ace_imf_all.imf,
              start_time, stop_time, disable_cache=True, n=20))
    return data.nbytes, durations


deltas = np.logspace(np.log10(100), np.log10(5000), num=10)
values = [scaling_point(delta) for delta in deltas]

stats = [durations for _, durations in values]
capacities = np.array([size for size, _ in values])

fig, ax = plt.subplots()
ax.violinplot(stats, positions=capacities, widths=np.gradient(capacities),
              showmeans=False, showmedians=True)
ax.set_ylabel('Execution time (ms)')
ax.set_xlabel('Loaded data size (Bytes)')
fig.suptitle('SciQLop remote cache scaling', fontsize=16)
plt.tight_layout()
plt.loglog()
plt.show()