Commit 126fb3c5 authored by Daniel Seybold

added analytic scripts

parent c9260ec3
Pipeline #92682 passed with stage
in 23 minutes and 46 seconds
import argparse
import glob
import os
import matplotlib
import pandas as pd
from matplotlib import pyplot as plt
class TimeSeriesAggregator:
Main object of the class. Stores many relevant values and data frames used for plotting
def __init__(self):
self.parser = None
self.input_path = None
self.output_path = None
self.marker_pos_x = None
self.label_text = None
self.agg_latency = None
self.agg_throughput = None
self.cols_latency = None
self.cols_throughput = None = -1
self.files = 0
self.min_t = 0
self.max_t = 0
self.min_l = 0
self.max_l = 0
self.plotAverage = False
def parser_setup(self):
    """Build the command-line parser and store it in ``self.parser``.

    Registered arguments:

    * ``--input`` / ``-i`` (required): path of the folder with the load.txt
    * ``--marker_pos_x`` / ``-pos`` (optional int): x position of a marker
    * ``--label`` / ``-l`` (optional str): label text shown at the marker
    * ``--run`` / ``-r`` (optional int, 0-9, default -1): run number
    * ``--plotAverage`` / ``-pa`` (optional boolean value, default False)

    ``--input``, ``--marker_pos_x`` and ``--label`` use ``nargs=1``, so their
    parsed values are one-element lists.
    """

    def _str_to_bool(value):
        # ``type=bool`` would turn every non-empty string (even "False")
        # into True, so the textual value is interpreted explicitly.
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(value))

    # TODO: add output format as parameter: pdf/png and align plotting call
    # define input directory with --input/-i and the output path with
    # --output/-o on which the resulting plots shall be stored
    parser = argparse.ArgumentParser(
        description='Plot timeseries based on YCSB result files.')
    parser.add_argument('--input', '-i', type=str, nargs=1, required=True,
                        help='path of the folder with the load.txt')
    # parser.add_argument('--output', '-o', type=str, nargs=1, required=True,
    #                     help='path to the store timeseries plot')

    # optional arguments for marker position at the x axis and the
    # label_text if a marker is given
    parser.add_argument('--marker_pos_x', '-pos', type=int, nargs=1,
                        default=None, required=False,
                        help='fixing a marker at given x position')
    parser.add_argument('--label', '-l', type=str, nargs=1,
                        default=None, required=False,
                        help='label text to display at the marker position '
                             '(argument marker_pos_x)')

    # optional argument: run number (the specified run will be shown in the
    # whole context)
    parser.add_argument('--run', '-r', type=int, choices=range(0, 10),
                        default=-1, required=False)
    parser.add_argument('--plotAverage', '-pa', type=_str_to_bool,
                        default=False, required=False)

    # fix the parser as main parser object of the TimeSeriesAggregator object
    self.parser = parser
def parse_input(self):
sets the essential values input_path, output_path, marker_pos_x and label_text for later plotting
# process input parameters
args = self.parser.parse_args()
# define inputPath as the directory with load.txt files
self.input_path = args.input[0]
self.output_path = self.input_path
# create, if necessary a new directory with the given output path (creates as many subdirectories as necessary)
#self.output_path = args.output[0]
#if not os.path.isdir(self.output_path):
# os.makedirs(self.output_path)
# set the marker and label_text if given, otherwise None
if args.marker_pos_x is not None:
self.marker_pos_x = args.marker_pos_x[0]
self.marker_pos_x = None
if args.label is not None:
self.label_text = args.label[0]
self.label_text = None
# get the run number, if given
if is not None: =
if args.plotAverage:
self.plotAverage = True
def extract_input(self):
data is prepared and processed for later plotting
# get all files from the given directory
all_files = glob.glob(os.path.join(self.input_path, "*.txt"))
file_list = []
# attribute that determines how many timestamp-rows will be created
current_max = 0
# create one data frame for each read file
for filename in all_files:
df = pd.read_csv(filename, error_bad_lines=False, sep=";",
names=['timeseries', 'timestamp', 'throughput', 'latency'],
self.files = len(file_list)
# determine the current maximum of timestamps (fixes the length of the data frame later)
maximum_timestamp_of_file = max(df['timestamp'])
if maximum_timestamp_of_file > current_max:
current_max = maximum_timestamp_of_file
# create an empty data frame with only timestamps for latency and throughput
self.cols_latency = ['timestamp'] + ['l{}'.format(i) for i in range(len(all_files))]
self.cols_throughput = ['timestamp'] + ['t{}'.format(i) for i in range(len(all_files))]
agg_frame_latency = pd.DataFrame(columns=self.cols_latency)
agg_frame_latency['timestamp'] = range(10, int(current_max) + 10, 10)
agg_frame_throughput = pd.DataFrame(columns=self.cols_throughput)
agg_frame_throughput['timestamp'] = range(10, int(current_max) + 10, 10)
# fill the new data frames, enumerate columns with file numbers
for index, file in enumerate(file_list):
agg_frame_latency['l{}'.format(index)] = file['latency']
agg_frame_throughput['t{}'.format(index)] = file['throughput']
# change the index so that timestamps are the index
agg_frame_latency.set_index('timestamp', inplace=True)
agg_frame_throughput.set_index('timestamp', inplace=True)
# calculate mean for each row
agg_frame_latency['Mean'] = agg_frame_latency.mean(axis=1, skipna=True)
agg_frame_throughput['Mean'] = agg_frame_throughput.mean(axis=1, skipna=True)
indices_latency = [x for x in agg_frame_latency.columns if x != "Mean"]
indices_throughput = [x for x in agg_frame_throughput.columns if x != "Mean"]
# the factor of the standard deviation is 1/n-1, thus it is clearly specified as ddof=1
# calculate standard deviation per row
agg_frame_latency['Standard Deviation'] = agg_frame_latency[indices_latency].std(axis=1, skipna=True, ddof=1)
agg_frame_throughput['Standard Deviation'] = agg_frame_throughput[indices_throughput].std(axis=1, skipna=True, ddof=1)
# save results in self-object
self.agg_latency = agg_frame_latency
self.agg_throughput = agg_frame_throughput
def plot_all_timeseries(self):
processes the collected data and generates plots out of the given data
# indicate the start of the plotting process
print("Plotting timeseries data...")
#TODO: only clean outliers if values are really outliers, cleaning all values by default will corrupt the data, i.e. determination of an outliers needs to be defined
#TODO: move to its own function
# cleaning of outliers
#for column in self.agg_throughput.columns:
# self.agg_throughput = self.remove_outliers(self.agg_throughput, column)
# determine the min and max values of throughput to adjust the scaling of the graphs later
self.min_t = self.agg_throughput.iloc[:, :-2].min().min()
self.max_t = self.agg_throughput.iloc[:, :-2].max().max()
timestamps_t = self.agg_throughput.index
def remove_outliers(df_in, col_name):
    """Drop the rows whose value in ``col_name`` lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 0.25 and 0.75 quantiles of the column. Values exactly on a
    fence are kept, so a column without spread (``IQR == 0``) is returned
    unchanged instead of being emptied. Rows with a missing value in
    ``col_name`` compare as False and are therefore dropped.

    :param df_in: the input data frame which shall be cleaned
    :param col_name: name of the column on which the calculation is performed
    :return: a data frame holding only the rows inside the fences
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Inclusive bounds: with iqr == 0 both fences equal the quartile value and
    # a strict comparison would discard every row.
    return df_in.loc[(values >= fence_low) & (values <= fence_high)]
def process(self):
main method. Executes the methods in correct order and terminates after running
if __name__ == "__main__":
from matplotlib import pyplot as plt
import argparse
import glob
import os
import random
import matplotlib
import pandas as pd
import seaborn as sns
import json
import itertools
# sns.color_palette().as_hex()
colors = [
# '#1f77b4',
palette = itertools.cycle(sns.color_palette(colors))
palette_end = itertools.cycle(sns.color_palette(colors))
plt.rcParams[""] = "Times New Roman"
sns.set(style="darkgrid", font='serif', font_scale=0.8)
cbMapping = {
"4" : "low",
"32" : "optimal",
"128" : "overload"
caMapping = {
"4" : "low",
"16" : "optimal",
"64" : "overload"
# Benchmark selection
# # ca-write-dataframes
#charts = ["4", "16", "64"]
#labelMapping = caMapping
#inputPath = "../dataframes/ca-write-dataframes/"
# # ca-read-dataframes
#charts = ["4", "16", "64"]
#labelMapping = caMapping
#inputPath = "../dataframes/ca-read-dataframes/"
# cb-write-dataframes
#charts = ["4" , "32", "128"]
#labelMapping = cbMapping
#inputPath = "../dataframes/cb-write-dataframes/"
# cb-read-dataframes
#charts = ["4" , "32", "128"]
charts = ["128"]
labelMapping = cbMapping
inputPath = "../dataframes/cb-read-dataframes/"
def load_dfs(threads):
iteration = "{}{}-threads".format(inputPath, threads)
print("processing {}".format(iteration))
# Load dumped dataframe
df = pd.read_pickle("{}{}-threads".format(inputPath, threads))
# Remove unnecessary columns (std and mean are determined by seaborn)
df = df.drop(columns=['Mean', 'Standard Deviation'])
df['timestamp'] = df.index
# Transform dataframe group columns into single column (seaborn req)
df = df.melt(id_vars=['timestamp'],
lineLabel = labelMapping[threads]
df['threads'] = " {} ".format(lineLabel)
#df['threads'] = " {} threads".format(threads)
# Drop rows with null values
df = df.dropna()
# Filter out above and below 10-percentile
df = df[df.bandwidth < df.bandwidth.quantile(.95)]
df = df[df.bandwidth > df.bandwidth.quantile(.05)]
return df
def load_events(threads):
    """Load the events JSON of one thread configuration into a data frame.

    The file ``<inputPath><threads>-events.json`` is parsed and converted with
    ``pd.DataFrame.from_dict``. Three columns are then added: ``diffVM`` and
    ``diffDBMS`` (the ``endVM`` and ``DBMSReady`` columns of the column-wise
    difference of the frame) and ``threads`` (the given argument).

    :param threads: thread configuration used in the file name and stored in
        the ``threads`` column
    :return: the data frame with the added columns
    """
    events_file = "{}{}-events.json".format(inputPath, threads)
    with open(events_file) as handle:
        frame = pd.DataFrame.from_dict(json.load(handle))

    # diff(axis=1) subtracts from every column the column to its left, i.e.
    # the result is the difference between neighbouring columns
    frame["diffVM"] = frame.diff(axis=1)["endVM"]
    frame["diffDBMS"] = frame.diff(axis=1)["DBMSReady"]
    # meta information: which thread configuration the events belong to
    frame["threads"] = threads
    return frame
dfs = []
events = []
for threads in charts:
dfs = pd.concat(dfs, ignore_index=True)
events = pd.concat(events, ignore_index=False)
scale_begin = 180
vm_end = events["diffVM"].mean()
dfs['threads'] = dfs['threads'].astype(str)
# Plot linecharts
ax = sns.lineplot(x="timestamp",
# scaling lines
label='scale-out trigger')
label='VM ready')
#use the average scale-out time for horizontal line
#end_timestamps = dfs.groupby(["threads"], sort=False)["timestamp"].max()
end_timestamps = events.groupby(["threads"], sort=False)["diffDBMS"].mean() + scale_begin + events.groupby(["threads"], sort=False)["diffVM"].mean()
# Ending timestamp viz for each thread config
for name, end_timestamp in end_timestamps.items():
#label=name.replace("(avg)", "(avg) scale-out end")
label="scale-out complete"
#label= labelMapping[name] + " scale-out\ncomplete"
# ax.legend(loc='upper right', ncol=3, borderpad=1)
ax.set_ylabel('average throughput in ops/s')
ax.set_xlabel('runtime in s')
#no title as title will be set via latex
legend = ax.legend()
# remove legend title
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles[1:], labels=labels[1:])
# store the created picture
# save file under the predetermined directory
output_file = os.path.join(inputPath, "fancy_single.pdf")
plt.savefig(output_file, format='pdf')
\ No newline at end of file
#! python3
import pandas as pd
import argparse
import matplotlib.pyplot as plt
from matplotlib import rcParams
import os
import numpy as np
import json
import pprint
import seaborn as sns
import matplotlib
class TimeseriesData:
    """Record with three list fields and an optional ``scaleOutAverage``."""

    # Class-level defaults are kept so that attribute access on the class
    # itself keeps working.
    timestamps = []
    operations = []
    latencies = []
    scaleOutAverage = None

    def __init__(self):
        # Every instance gets its own lists; without this, appending through
        # one instance would mutate the lists shared by all instances.
        self.timestamps = []
        self.operations = []
        self.latencies = []
write_base_path = "C:/git/omi-gitlab/mowgli-results/elasticity/overload-calibration/results/write-only/openstack"
write_data_files = [
read_base_path = "C:/git/omi-gitlab/mowgli-results/elasticity/overload-calibration/results/read-only/openstack"
read_data_files = [
def load():
data = []
count = 1
#dirty hack from hell to get as well the read-only data.json files
filecounter = 0
for f_tupl in write_data_files:
path = write_base_path + f_tupl[0]
filtered_json = {}
with open(path, 'r') as file:
raw_file_data =
j = json.loads(raw_file_data)
filtered_json['cassandra-write'] = j['metricsLoadPhase']['throughput']['avg']
filtered_json['cassandra-write_std'] = j['metricsLoadPhase']['throughput']['std']
path = write_base_path + f_tupl[1]
with open(path, 'r') as file:
raw_file_data =
j = json.loads(raw_file_data)
filtered_json['couchbase-write'] = j['metricsLoadPhase']['throughput']['avg']
filtered_json['couchbase-write_std'] = j['metricsLoadPhase']['throughput']['std']
filtered_json["YCSB client threads"] = count
#read-only parts
path = read_base_path + str(read_data_files[filecounter][0])
with open(path, 'r') as file:
raw_file_data =
j = json.loads(raw_file_data)
filtered_json['cassandra-read'] = j['metricsTransactionPhase']['throughput']['avg']
filtered_json['cassandra-read_std'] = j['metricsTransactionPhase']['throughput']['std']
path = read_base_path + str(read_data_files[filecounter][1])
with open(path, 'r') as file:
raw_file_data =
j = json.loads(raw_file_data)
filtered_json['couchbase-read'] = j['metricsTransactionPhase']['throughput']['avg']
filtered_json['couchbase-read_std'] = j['metricsTransactionPhase']['throughput']['std']
count = count*2
filecounter = filecounter + 1
return data
def to_df(json):
    """Convert the collected records into a data frame.

    :param json: records accepted by ``pd.DataFrame.from_records``; each one
        must provide the field 'YCSB client threads'
    :return: data frame indexed by the 'YCSB client threads' column
    """
    return pd.DataFrame.from_records(json).set_index('YCSB client threads')
def plot(df):
rcParams.update({'figure.autolayout': True})
sns.set_context("notebook", font_scale=1)
sns.set_style({'': 'serif'})
bw_tp = df[['cassandra-write','cassandra-read','couchbase-write','couchbase-read']]
bw_tp_std = df[['cassandra-write_std','cassandra-read_std','couchbase-write_std','couchbase-read_std']]
bw_tp_std = bw_tp_std.rename(columns={"cassandra-write_std":"cassandra-write","cassandra-read_std":"cassandra-read","couchbase-write_std":"couchbase-write","couchbase-read_std":"couchbase-read"})
ax =
figsize=(7, 5)
ax.set_ylabel("avg. throughput in ops/s")
ax.set_xlabel("workload intensities by YCSB client threads ")
raw_data = load()
df = to_df(raw_data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment