Commit e7d9b0c3 authored by Daniel Seybold

added new plotting scripts

parent a6a43dcf
#!/usr/bin/env python3
import argparse
import json
import os
import pprint

import matplotlib
matplotlib.use('Agg')  # select the non-interactive backend before importing pyplot
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import pandas as pd
import seaborn as sns
class TimeseriesData:
    # note: these are class-level attributes, shared across all instances;
    # this container is currently unused by the script below
    timestamps = []
    operations = []
    latencies = []
    scaleOutAverage = None
base_path = "C:/git/omi-gitlab/mowgli-results/elasticity/overload-calibration/results/sensorstorage/openstack"
data_files = [
["/cassandra/nodes-3_replication-3_write-one_threads-1/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-1/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-2/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-2/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-4/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-4/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-8/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-8/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-16/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-16/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-32/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-32/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-64/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-64/plots/data.json"],
["/cassandra/nodes-3_replication-3_write-one_threads-128/plots/data.json",
"/couchbase/nodes-3_replication-3_consistency-none_threads-128/plots/data.json"]
]
plt.autoscale()  # enable autoscaling on the current axes
def load():
    """Collect the avg/std load-phase throughput for each thread count."""
    data = []
    count = 1
    for f_tupl in data_files:
        filtered_json = {}
        # Cassandra result file
        path = base_path + f_tupl[0]
        with open(path, 'r') as file:
            j = json.loads(file.read())
        filtered_json['cassandra'] = j['metricsLoadPhase']['throughput']['avg']
        filtered_json['cassandra_std'] = j['metricsLoadPhase']['throughput']['std']
        # Couchbase result file
        path = base_path + f_tupl[1]
        with open(path, 'r') as file:
            j = json.loads(file.read())
        filtered_json['couchbase'] = j['metricsLoadPhase']['throughput']['avg']
        filtered_json['couchbase_std'] = j['metricsLoadPhase']['throughput']['std']
        filtered_json["YCSB client threads"] = count
        data.append(filtered_json)
        count = count * 2  # thread counts double per step: 1, 2, 4, ..., 128
    return data
def to_df(records):
    # the parameter is named 'records' to avoid shadowing the json module
    df = pd.DataFrame.from_records(records)
    df = df.set_index('YCSB client threads')
    return df
def plot(df):
    rcParams.update({'figure.autolayout': True})
    sns.set(style="darkgrid")
    sns.set_context("notebook", font_scale=1)
    sns.set_style({'font.family': 'serif'})
    sns.set_palette("muted")
    bw_tp = df[['cassandra', 'couchbase']]
    # rename the std columns to match the value columns so they can serve as error bars
    bw_tp_std = df[['cassandra_std', 'couchbase_std']]
    bw_tp_std = bw_tp_std.rename(columns={"cassandra_std": "cassandra", "couchbase_std": "couchbase"})
    ax = bw_tp.plot.bar(
        title="avg. write throughput - 3 node Cassandra/Couchbase cluster",
        yerr=bw_tp_std,
        rot=90,
        capsize=2,
        figsize=(7, 5)
    )
    ax.set_ylabel("Ops/s")
    plt.savefig("./test.png")
    plt.savefig("./test.pdf")
raw_data = load()
df = to_df(raw_data)
print(df)
plot(df)
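# running this script (it takes no CLI arguments) writes test.png and test.pdf to the current working directory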
import argparse
import glob
import os
import fnmatch
import matplotlib
import pandas as pd
@@ -50,8 +51,6 @@ class TimeSeriesAggregator:
                             help='fixes a marker at the given x position')
         parser.add_argument('--label', '-l', type=str, nargs=1, default=None, required=False,
                             help='label text to display at the marker position (argument marker_pos_x)')
-        # optional argument: run number (the specified run will be shown in the whole context)
-        parser.add_argument('--run', '-r', type=int, choices=range(0, 10), default=-1, required=False)
         parser.add_argument('--plotAverage', '-pa', type=bool, default=False, required=False)
         # store the parser as the main parser object of the TimeSeriesAggregator
         self.parser = parser
@@ -64,6 +63,9 @@ class TimeSeriesAggregator:
         args = self.parser.parse_args()
         # define input_path as the directory with the load.txt files
         self.input_path = args.input[0]
+        # derive the number of runs from the number of *.txt files in the folder
+        self.run = len(fnmatch.filter(os.listdir(self.input_path), '*.txt'))
+        self.output_path = self.input_path
         # if necessary, create a new directory at the given output path (creates as many subdirectories as needed)
@@ -79,10 +81,7 @@ class TimeSeriesAggregator:
             self.label_text = args.label[0]
         else:
             self.label_text = None
-        # get the run number, if given
-        if args.run is not None:
-            self.run = args.run
         if args.plotAverage:
             self.plotAverage = True
@@ -185,8 +184,9 @@ class TimeSeriesAggregator:
         # draw the standard deviation as an area around the mean
         mean_t = self.agg_throughput["Mean"]
         std_t = self.agg_throughput["Standard Deviation"]
-        plt.fill_between(timestamps_t, mean_t - 2 * std_t,
-                         mean_t + 2 * std_t, color='b', alpha=0.2, label='Standard Deviation')
+        # TODO: verify why 2 * std was used before; this version shades only +/- one standard deviation
+        plt.fill_between(timestamps_t, mean_t - std_t,
+                         mean_t + std_t, color='b', alpha=0.2, label='Standard Deviation')
         # modify the default legend
         ax.legend(loc='lower center', ncol=3, borderpad=1)
# adjust size of the figure
@@ -204,8 +204,8 @@ class TimeSeriesAggregator:
         #TODO: move to its own function
         # cleaning of outliers
-        #for column in self.agg_latency.columns:
-        #    self.agg_latency = self.remove_outliers(self.agg_latency, column)
+        for column in self.agg_latency.columns:
+            self.agg_latency = self.remove_outliers(self.agg_latency, column)
# determine the min and max values of latency to adjust the scaling of the graphs later
@@ -234,8 +234,10 @@ class TimeSeriesAggregator:
         # draw the standard deviation as an area around the mean
         mean_l = self.agg_latency["Mean"]
         std_l = self.agg_latency["Standard Deviation"]
-        plt.fill_between(timestamps_l, mean_l - 2 * std_l,
-                         mean_l + 2 * std_l, color='b', alpha=0.2, label='Standard Deviation')
+        # TODO: verify why 2 * std_l was used before; this version shades only +/- one standard deviation
+        plt.fill_between(timestamps_l, mean_l - std_l,
+                         mean_l + std_l, color='b', alpha=0.2, label='Standard Deviation')
         # modify the default legend
         ax.legend(loc='lower center', ncol=3, borderpad=1)
# adjust size of the figure
@@ -253,12 +255,21 @@ class TimeSeriesAggregator:
         :param col_name: the column in which the calculation shall be performed
         :return: a cleaned data frame
         """
-        q1 = df_in[col_name].quantile(0.25)
-        q3 = df_in[col_name].quantile(0.75)
-        iqr = q3 - q1  # interquartile range
-        fence_low = q1 - 1.5 * iqr
-        fence_high = q3 + 1.5 * iqr
-        df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
+        #q1 = df_in[col_name].quantile(0.25)
+        #q3 = df_in[col_name].quantile(0.75)
+        #iqr = q3 - q1  # interquartile range
+        #fence_low = q1 - 1.5 * iqr
+        #fence_high = q3 + 1.5 * iqr
+        #df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
+        # alternative outlier removal:
+        # drop rows with null values
+        df_in = df_in.dropna()
+        # keep only the values between the 5th and the 95th percentile
+        df_in = df_in[df_in[col_name] < df_in[col_name].quantile(.95)]
+        df_in = df_in[df_in[col_name] > df_in[col_name].quantile(.05)]
+        df_out = df_in
         return df_out
def process(self):
......
import argparse
import glob
import os
import matplotlib
import pandas as pd
import seaborn as sns
matplotlib.use('Agg')
from matplotlib import pyplot as plt
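# thread counts whose pickled "<n>-threads" dataframes (produced by the aggregator script) are merged into one figure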
charts = ["4","32","128"]
inputPath = "C:/mowgli/cb-write-dataframes/"
plt.rcParams["font.family"] = "Times New Roman"
def read_and_plot(run):
    df1 = pd.read_pickle(inputPath + "{}-threads".format(run))
    print(df1)
    #sns.lineplot(x=df1.index, y="Mean", hue="Standard Deviation", data=df1)
    #plt.savefig("./testplot.pdf", format='pdf')
    #plt.close()
    timestamps_t = df1.index
    mean_name = "avg-{}-threads".format(run)
    std_name = "std-{}".format(run)
    df1 = df1.rename(columns={"Mean": mean_name})
    df1 = df1.rename(columns={"Standard Deviation": std_name})
    ax = df1[mean_name].plot()
    # draw the scale-out trigger only once
    if run == charts[0]:
        ax.axvline(180, color='red', linestyle=':', label='scale-out')
    # draw the standard deviation as an area around the mean
    mean_t = df1[mean_name]
    std_t = df1[std_name]
    # optional: label=std_name
    plt.fill_between(timestamps_t, mean_t - 2 * std_t,
                     mean_t + 2 * std_t, alpha=0.2)
    # modify the default legend
    ax.legend(loc='upper right', ncol=3, borderpad=1)
    ax.set_ylabel('throughput in ops/s')
    ax.set_xlabel('runtime in s')
    maxY = df1[mean_name].max() + 2000
    plt.ylim(0, maxY)
    # adjust the size of the figure; note that plt.plot() does not accept a figsize
    # argument, so set it on the current figure instead (assuming 10 x 6 inches was intended)
    plt.gcf().set_size_inches(10, 6)
for run in charts:
    read_and_plot(run)

# store the created figure under the predetermined directory
output_file = os.path.join(inputPath, "merged.pdf")
plt.savefig(output_file, format='pdf')
plt.close()
\ No newline at end of file
import argparse
import glob
import os
import fnmatch
import matplotlib
import pandas as pd
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import plotting_config
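# note: plotting_config (apparently the constants module at the end of this commit) is imported but not used below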
class TimeSeriesAggregator:
    """
    Main object of the class. Stores the relevant values and data frames used for plotting.
    """

    def __init__(self):
        self.parser = None
        self.input_path = None
        self.output_path = None
        self.agg_latency = None
        self.agg_throughput = None
        self.cols_latency = None
        self.cols_throughput = None
        self.run = -1
        self.threads = 1
        self.files = 0
        self.min_t = 0
        self.max_t = 0
        self.min_l = 0
        self.max_l = 0
    def parser_setup(self):
        """
        Sets up a parser that reads the parameters given on the command line.
        """
        #TODO: add the output format (pdf/png) as a parameter and align the plotting call
        # define the input directory with --input/-i; the resulting plots are stored there as well
        parser = argparse.ArgumentParser(description='Plot timeseries based on YCSB result files.')
        parser.add_argument('--input', '-i', type=str, nargs=1, required=True,
                            help='path of the folder with the load.txt or transaction.txt')
        #parser.add_argument('--output', '-o', type=str, nargs=1, required=True,
        #                    help='path to store the timeseries plot')
        # required argument: the number of YCSB client threads used in the experiment
        parser.add_argument('--threads', '-t', type=int, choices=range(1, 129), required=True)
        # store the parser as the main parser object of the TimeSeriesAggregator
        self.parser = parser
    def parse_input(self):
        # process the input parameters
        args = self.parser.parse_args()
        # define input_path as the directory with the load.txt files
        self.input_path = args.input[0]
        self.output_path = self.input_path
        # derive the number of runs from the number of *.txt files in the folder
        self.run = len(fnmatch.filter(os.listdir(self.input_path), '*.txt'))
        self.threads = args.threads
    def extract_input(self):
        """
        Prepares and processes the data for later plotting.
        """
        # get all files from the given directory
        all_files = glob.glob(os.path.join(self.input_path, "*.txt"))
        file_list = []
        # determines how many timestamp rows will be created
        current_max = 0
        # create one data frame per result file;
        # expected row format: <timeseries>;<timestamp>;<throughput>;<latency>
        for filename in all_files:
            # note: error_bad_lines is deprecated since pandas 1.3 in favour of on_bad_lines='skip'
            df = pd.read_csv(filename, error_bad_lines=False, sep=";",
                             names=['timeseries', 'timestamp', 'throughput', 'latency'],
                             index_col=False)
            file_list.append(df)
            # determine the current maximum of timestamps (fixes the length of the data frame later)
            maximum_timestamp_of_file = max(df['timestamp'])
            if maximum_timestamp_of_file > current_max:
                current_max = maximum_timestamp_of_file
        self.files = len(file_list)
        # create an empty data frame with only timestamps for latency and throughput,
        # one row per 10-second reporting interval
        self.cols_latency = ['timestamp'] + ['l{}'.format(i) for i in range(len(all_files))]
        self.cols_throughput = ['timestamp'] + ['t{}'.format(i) for i in range(len(all_files))]
        agg_frame_latency = pd.DataFrame(columns=self.cols_latency)
        agg_frame_latency['timestamp'] = range(10, int(current_max) + 10, 10)
        agg_frame_throughput = pd.DataFrame(columns=self.cols_throughput)
        agg_frame_throughput['timestamp'] = range(10, int(current_max) + 10, 10)
        # fill the new data frames, enumerating the columns with file numbers
        for index, file in enumerate(file_list):
            agg_frame_latency['l{}'.format(index)] = file['latency']
            agg_frame_throughput['t{}'.format(index)] = file['throughput']
        # use the timestamps as index
        agg_frame_latency.set_index('timestamp', inplace=True)
        agg_frame_throughput.set_index('timestamp', inplace=True)
        # calculate the mean of each row
        agg_frame_latency['Mean'] = agg_frame_latency.mean(axis=1, skipna=True)
        agg_frame_throughput['Mean'] = agg_frame_throughput.mean(axis=1, skipna=True)
        indices_latency = [x for x in agg_frame_latency.columns if x != "Mean"]
        indices_throughput = [x for x in agg_frame_throughput.columns if x != "Mean"]
        # calculate the sample standard deviation per row; the 1/(n-1) factor is made explicit with ddof=1
        agg_frame_latency['Standard Deviation'] = agg_frame_latency[indices_latency].std(axis=1, skipna=True, ddof=1)
        agg_frame_throughput['Standard Deviation'] = agg_frame_throughput[indices_throughput].std(axis=1, skipna=True, ddof=1)
        # save the results on the self object
        self.agg_latency = agg_frame_latency
        self.agg_throughput = agg_frame_throughput
    def extract_dataframe(self):
        """
        Processes the collected data and pickles the aggregated throughput frame.
        """
        # THROUGHPUT
        # indicate the start of the extraction process
        print("Extracting timeseries data...")
        # min/max over the raw per-run columns, excluding the trailing Mean and Standard Deviation columns
        self.min_t = self.agg_throughput.iloc[:, :-2].min().min()
        self.max_t = self.agg_throughput.iloc[:, :-2].max().max()
        timestamps_t = self.agg_throughput.index
        filename = str(self.threads) + "-threads"
        output = os.path.join(self.output_path, filename)
        print(self.agg_throughput)
        self.agg_throughput.to_pickle(output)
    def process(self):
        """
        Main method. Executes the methods in the correct order and terminates after running.
        """
        self.parser_setup()
        self.parse_input()
        self.extract_input()
        self.extract_dataframe()


if __name__ == "__main__":
    TimeSeriesAggregator().process()
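# Usage sketch (the file names here are hypothetical): aggregate each thread-count
# folder first, then merge the pickled frames into one figure with the script above:
#   python timeseries_aggregator.py --input C:/mowgli/cb-write-dataframes/ --threads 4
#   python merge_timeseries_plots.py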
# aggregation directory
LOAD_AGGREGATION_DIRECTORY = "plots/timeseriesLoadData"
TRANSACTION_AGGREGATION_DIRECTORY = "plots/timeseriesTransactionData"
# data properties
DATA_DIRECTORY = "data/"
DATA_LOAD_FILE = "load.txt"
DATA_TRANSACTION_FILE = "transaction.txt"
@@ -2011,6 +2011,8 @@ definitions:
           type: number
           format: int32
 ##################### Templates ##################################
......