mowgli / evaluation-orchestrator

Commit e7d9b0c3 authored Aug 12, 2019 by Daniel Seybold

    added new plotting scripts

Parent: a6a43dcf
Pipeline #53562 passed in 11 minutes and 4 seconds
Showing 6 changed files with 360 additions and 18 deletions (+360, -18)
plotting/boxplots/pandasBarplot.py          +105   -0
plotting/timeseries/aggregateTimeseries.py   +29  -18
plotting/timeseries/merger.py                +61   -0
plotting/utils/extractDataFrame.py          +154   -0
plotting/utils/plotting_config.py             +9   -0
swagger/swagger.yaml                          +2   -0
plotting/boxplots/pandasBarplot.py (new file, mode 100644)
#! python3
import pandas as pd
import argparse
import matplotlib.pyplot as plt
from matplotlib import rcParams
import os
import numpy as np
import json
import pprint
import seaborn as sns
import matplotlib

matplotlib.use('Agg')


class TimeseriesData:
    timestamps = []
    operations = []
    latencies = []
    scaleOutAverage = None


base_path = "C:/git/omi-gitlab/mowgli-results/elasticity/overload-calibration/results/sensorstorage/openstack"
data_files = [
    ["/cassandra/nodes-3_replication-3_write-one_threads-1/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-1/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-2/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-2/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-4/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-4/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-8/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-8/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-16/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-16/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-32/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-32/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-64/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-64/plots/data.json"],
    ["/cassandra/nodes-3_replication-3_write-one_threads-128/plots/data.json",
     "/couchbase/nodes-3_replication-3_consistency-none_threads-128/plots/data.json"],
]

matplotlib.pyplot.autoscale()


def load():
    data = []
    count = 1
    for f_tupl in data_files:
        path = base_path + f_tupl[0]
        filtered_json = {}
        with open(path, 'r') as file:
            raw_file_data = file.read()
        j = json.loads(raw_file_data)
        filtered_json['cassandra'] = j['metricsLoadPhase']['throughput']['avg']
        filtered_json['cassandra_std'] = j['metricsLoadPhase']['throughput']['std']
        path = base_path + f_tupl[1]
        with open(path, 'r') as file:
            raw_file_data = file.read()
        j = json.loads(raw_file_data)
        filtered_json['couchbase'] = j['metricsLoadPhase']['throughput']['avg']
        filtered_json['couchbase_std'] = j['metricsLoadPhase']['throughput']['std']
        filtered_json["YCSB client threads"] = count
        data.append(filtered_json)
        count = count * 2
    return data


def to_df(json):
    df = pd.DataFrame.from_records(json)
    df = df.set_index('YCSB client threads')
    return df


def plot(df):
    rcParams.update({'figure.autolayout': True})
    sns.set(style="darkgrid")
    sns.set_context("notebook", font_scale=1)
    sns.set_style({'font.family': 'serif'})
    sns.set_palette("muted")
    bw_tp = df[['cassandra', 'couchbase']]
    bw_tp_std = df[['cassandra_std', 'couchbase_std']]
    bw_tp_std = bw_tp_std.rename(columns={"cassandra_std": "cassandra",
                                          "couchbase_std": "couchbase"})
    ax = bw_tp.plot.bar(title="avg. write throughput - 3 node Cassandra/Couchbase cluster",
                        yerr=bw_tp_std, rot=90, capsize=2, figsize=(7, 5))
    ax.set_ylabel("Ops/s")
    plt.savefig("./test.png")
    plt.savefig("./test.pdf")


raw_data = load()
df = to_df(raw_data)
print(df)
plot(df)
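For reference, the script reads only two values from each data.json file. A minimal sketch of the input shape it assumes follows; the key names are taken from the code above, while the numbers are invented for illustration:

# Illustrative only: the minimal data.json shape pandasBarplot.py expects.
# Key names come from the script; the values are made up.
example_data_json = {
    "metricsLoadPhase": {
        "throughput": {
            "avg": 10500.3,  # average throughput (ops/s) during the load phase
            "std": 412.7     # standard deviation of the throughput samples
        }
    }
}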
plotting/timeseries/aggregateTimeseries.py

import argparse
import glob
import os
import fnmatch
import matplotlib
import pandas as pd
@@ -50,8 +51,6 @@ class TimeSeriesAggregator:
                            help='fixing a marker at given x position')
        parser.add_argument('--label', '-l', type=str, nargs=1, default=None, required=False,
                            help='label text to display at the marker position (argument marker_pos_x)')
        # optional argument: run number (the specified run will be shown in the whole context)
        parser.add_argument('--run', '-r', type=int, choices=range(0, 10), default=-1, required=False)
        parser.add_argument('--plotAverage', '-pa', type=bool, default=False, required=False)
        # fix the parser as main parser object of the TimeSeriesAggregator object
        self.parser = parser
@@ -64,6 +63,9 @@ class TimeSeriesAggregator:
        args = self.parser.parse_args()
        # define inputPath as the directory with load.txt files
        self.input_path = args.input[0]
        # derive the number of runs based on the number of *.txt files in the folder
        self.run = len(fnmatch.filter(os.listdir(self.input_path), '*.txt'))
        self.output_path = self.input_path
        # create, if necessary, a new directory with the given output path (creates as many subdirectories as necessary)
@@ -79,10 +81,7 @@ class TimeSeriesAggregator:
            self.label_text = args.label[0]
        else:
            self.label_text = None
        # get the run number, if given
        if args.run is not None:
            self.run = args.run
        if args.plotAverage:
            self.plotAverage = True
@@ -185,8 +184,9 @@ class TimeSeriesAggregator:
        # draw the standard deviation as area around the mean
        mean_t = self.agg_throughput["Mean"]
        std_t = self.agg_throughput["Standard Deviation"]
        plt.fill_between(timestamps_t, mean_t - 2 * std_t, mean_t + 2 * std_t,
                         color='b', alpha=0.2, label='Standard Deviation')
        # TODO: check why 2 * std was used here; removed in the current version, which uses only std
        # (under normality, mean +/- 2 std covers ~95% of values, mean +/- 1 std about 68%)
        plt.fill_between(timestamps_t, mean_t - std_t, mean_t + std_t,
                         color='b', alpha=0.2, label='Standard Deviation')
        # modify default legend
        ax.legend(loc='lower center', ncol=3, borderpad=1)
        # adjust size of the figure
@@ -204,8 +204,8 @@ class TimeSeriesAggregator:
        #TODO: move to its own function
        # cleaning of outliers
        # for column in self.agg_latency.columns:
        #     self.agg_latency = self.remove_outliers(self.agg_latency, column)
        for column in self.agg_latency.columns:
            self.agg_latency = self.remove_outliers(self.agg_latency, column)
        # determine the min and max values of latency to adjust the scaling of the graphs later
@@ -234,8 +234,10 @@ class TimeSeriesAggregator:
        # draw the standard deviation as area around the mean
        mean_l = self.agg_latency["Mean"]
        std_l = self.agg_latency["Standard Deviation"]
        plt.fill_between(timestamps_l, mean_l - 2 * std_l, mean_l + 2 * std_l,
                         color='b', alpha=0.2, label='Standard Deviation')
        # TODO: check why 2 * std_l was used here; removed in the current version, which uses only std_l
        plt.fill_between(timestamps_l, mean_l - std_l, mean_l + std_l,
                         color='b', alpha=0.2, label='Standard Deviation')
        # modify default legend
        ax.legend(loc='lower center', ncol=3, borderpad=1)
        # adjust size of the figure
@@ -253,12 +255,21 @@ class TimeSeriesAggregator:
        :param col_name: the current column name: in which column the calculation shall be performed
        :return: a cleaned data frame
        """
        q1 = df_in[col_name].quantile(0.25)
        q3 = df_in[col_name].quantile(0.75)
        iqr = q3 - q1  # interquartile range
        fence_low = q1 - 1.5 * iqr
        fence_high = q3 + 1.5 * iqr
        df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
        # q1 = df_in[col_name].quantile(0.25)
        # q3 = df_in[col_name].quantile(0.75)
        # iqr = q3 - q1  # interquartile range
        # fence_low = q1 - 1.5 * iqr
        # fence_high = q3 + 1.5 * iqr
        # df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
        # alternative outlier removal
        # drop rows with null values
        df_in = df_in.dropna()
        # filter out values above the 95th and below the 5th percentile
        # (note: this overwrites the IQR-based df_out computed above)
        df_in = df_in[df_in[col_name] < df_in[col_name].quantile(.95)]
        df_in = df_in[df_in[col_name] > df_in[col_name].quantile(.05)]
        df_out = df_in
        return df_out

    def process(self):
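Note that the new --plotAverage flag is declared with type=bool, a common argparse pitfall: bool() applied to any non-empty string is True, so passing --plotAverage False would still enable it. A plausible invocation under that caveat, using only the flags visible in this diff plus --input from the surrounding script (paths illustrative):

    python aggregateTimeseries.py --input ./results/load/ --run 3 --plotAverage 1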
plotting/timeseries/merger.py (new file, mode 100644)
import argparse
import glob
import os
import matplotlib
import pandas as pd
import seaborn as sns

matplotlib.use('Agg')
from matplotlib import pyplot as plt

charts = ["4", "32", "128"]
inputPath = "C:/mowgli/cb-write-dataframes/"

plt.rcParams["font.family"] = "Times New Roman"


def read_and_plot(run):
    df1 = pd.read_pickle(inputPath + "{}-threads".format(run))
    print(df1)
    #sns.lineplot(x=df1.index, y="Mean", hue="Standard Deviation", data=df1)
    #plt.savefig("./testplot.pdf", format='pdf')
    #plt.close()
    timestamps_t = df1.index
    mean_name = "avg-{}-threads".format(run)
    std_name = "std-{}".format(run)
    df1 = df1.rename(columns={"Mean": mean_name})
    df1 = df1.rename(columns={"Standard Deviation": std_name})
    ax = df1[mean_name].plot()
    # draw scale-out trigger only once
    if run == charts[0]:
        ax.axvline(180, color='red', linestyle=':', label='scale-out')
    # draw the standard deviation as area around the mean
    mean_t = df1[mean_name]
    std_t = df1[std_name]
    # optional: label=std_name
    plt.fill_between(timestamps_t, mean_t - 2 * std_t, mean_t + 2 * std_t, alpha=0.2)
    # modify default legend
    ax.legend(loc='upper right', ncol=3, borderpad=1)
    ax.set_ylabel('throughput in ops/s')
    ax.set_xlabel('runtime in s')
    maxY = df1[mean_name].max() + 2000
    plt.ylim(0, maxY)
    # adjust size of the figure
    # (note: figsize is a plt.figure() argument, not plt.plot(); as written this call has no effect)
    plt.plot(figsize=(1000, 600))


for run in charts:
    read_and_plot(run)

# store the created figure under the predetermined directory
output_file = os.path.join(inputPath, "merged.pdf")
plt.savefig(output_file, format='pdf')
plt.close()
\ No newline at end of file
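merger.py expects pickled data frames named "{n}-threads" with "Mean" and "Standard Deviation" columns, as produced by extractDataFrame.py below. A hypothetical smoke test that fabricates one such pickle (path and values are made up):

import numpy as np
import pandas as pd

# Fabricate a frame shaped like extractDataFrame.py's output: indexed by
# timestamp (seconds, in 10 s steps), with Mean and Standard Deviation columns.
timestamps = list(range(10, 610, 10))
dummy = pd.DataFrame({
    "Mean": np.random.uniform(9000, 11000, len(timestamps)),
    "Standard Deviation": np.random.uniform(200, 500, len(timestamps)),
}, index=timestamps)
dummy.to_pickle("C:/mowgli/cb-write-dataframes/4-threads")  # name matches charts[0]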
plotting/utils/extractDataFrame.py (new file, mode 100644)
import argparse
import glob
import os
import fnmatch
import matplotlib
import pandas as pd

matplotlib.use('Agg')
from matplotlib import pyplot as plt
import plotting_config


class TimeSeriesAggregator:
    """
    Main object of the class. Stores many relevant values and data frames used for plotting
    """

    def __init__(self):
        self.parser = None
        self.input_path = None
        self.output_path = None
        self.agg_latency = None
        self.agg_throughput = None
        self.cols_latency = None
        self.cols_throughput = None
        self.run = -1
        self.threads = 1
        self.files = 0
        self.min_t = 0
        self.max_t = 0
        self.min_l = 0
        self.max_l = 0

    def parser_setup(self):
        """
        sets up a parser that reads command-line parameters at execution time
        """
        #TODO: add output format as parameter: pdf/png and align plotting call
        # define the input directory with --input/-i
        parser = argparse.ArgumentParser(description='Plot timeseries based on YCSB result files.')
        parser.add_argument('--input', '-i', type=str, nargs=1, required=True,
                            help='path of the folder with the load.txt or transaction.txt')
        #parser.add_argument('--output', '-o', type=str, nargs=1, required=True,
        #                    help='path to store the timeseries plot')
        # required argument: the number of YCSB client threads of this experiment
        parser.add_argument('--threads', '-t', type=int, choices=range(1, 129), required=True)
        # fix the parser as main parser object of the TimeSeriesAggregator object
        self.parser = parser

    def parse_input(self):
        # process input parameters
        args = self.parser.parse_args()
        # define inputPath as the directory with load.txt files
        self.input_path = args.input[0]
        self.output_path = self.input_path
        # derive the number of runs based on the number of *.txt files in the folder
        self.run = len(fnmatch.filter(os.listdir(self.input_path), '*.txt'))
        self.threads = args.threads

    def extract_input(self):
        """
        data is prepared and processed for later plotting
        """
        # get all files from the given directory
        all_files = glob.glob(os.path.join(self.input_path, "*.txt"))
        file_list = []
        # attribute that determines how many timestamp rows will be created
        current_max = 0
        # create one data frame for each read file
        for filename in all_files:
            df = pd.read_csv(filename, error_bad_lines=False, sep=";",
                             names=['timeseries', 'timestamp', 'throughput', 'latency'],
                             index_col=False)
            file_list.append(df)
            self.files = len(file_list)
            # determine the current maximum of timestamps (fixes the length of the data frame later)
            maximum_timestamp_of_file = max(df['timestamp'])
            if maximum_timestamp_of_file > current_max:
                current_max = maximum_timestamp_of_file
        # create an empty data frame with only timestamps for latency and throughput
        self.cols_latency = ['timestamp'] + ['l{}'.format(i) for i in range(len(all_files))]
        self.cols_throughput = ['timestamp'] + ['t{}'.format(i) for i in range(len(all_files))]
        agg_frame_latency = pd.DataFrame(columns=self.cols_latency)
        agg_frame_latency['timestamp'] = range(10, int(current_max) + 10, 10)
        agg_frame_throughput = pd.DataFrame(columns=self.cols_throughput)
        agg_frame_throughput['timestamp'] = range(10, int(current_max) + 10, 10)
        # fill the new data frames, enumerate columns with file numbers
        for index, file in enumerate(file_list):
            agg_frame_latency['l{}'.format(index)] = file['latency']
            agg_frame_throughput['t{}'.format(index)] = file['throughput']
        # change the index so that timestamps are the index
        agg_frame_latency.set_index('timestamp', inplace=True)
        agg_frame_throughput.set_index('timestamp', inplace=True)
        # calculate the mean for each row
        agg_frame_latency['Mean'] = agg_frame_latency.mean(axis=1, skipna=True)
        agg_frame_throughput['Mean'] = agg_frame_throughput.mean(axis=1, skipna=True)
        indices_latency = [x for x in agg_frame_latency.columns if x != "Mean"]
        indices_throughput = [x for x in agg_frame_throughput.columns if x != "Mean"]
        # use the sample standard deviation, i.e. normalization by 1/(n-1), hence ddof=1
        # calculate the standard deviation per row
        agg_frame_latency['Standard Deviation'] = agg_frame_latency[indices_latency].std(
            axis=1, skipna=True, ddof=1)
        agg_frame_throughput['Standard Deviation'] = agg_frame_throughput[indices_throughput].std(
            axis=1, skipna=True, ddof=1)
        # save results in the self-object
        self.agg_latency = agg_frame_latency
        self.agg_throughput = agg_frame_throughput

    def extract_dataframe(self):
        """
        processes the collected data and stores the aggregated throughput frame as a pickle
        """
        # THROUGHPUT
        # indicate the start of the extraction process
        print("Extracting timeseries data...")
        self.min_t = self.agg_throughput.iloc[:, :-2].min().min()
        self.max_t = self.agg_throughput.iloc[:, :-2].max().max()
        timestamps_t = self.agg_throughput.index
        filename = str(self.threads) + "-threads"
        output = os.path.join(self.output_path, filename)
        print(self.agg_throughput)
        self.agg_throughput.to_pickle(output)

    def process(self):
        """
        main method. Executes the methods in the correct order and terminates after running
        """
        self.parser_setup()
        self.parse_input()
        self.extract_input()
        self.extract_dataframe()


if __name__ == "__main__":
    TimeSeriesAggregator().process()
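The script is invoked once per experiment folder and pickles the aggregated throughput frame as "<threads>-threads" next to the input *.txt files, which is the format merger.py above reads. A plausible invocation (folder name invented):

    python extractDataFrame.py --input ./results/nodes-3_threads-32/ --threads 32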
plotting/utils/plotting_config.py (new file, mode 100644)
# aggregation directories
LOAD_AGGREGATION_DIRECTORY = "plots/timeseriesLoadData"
TRANSACTION_AGGREGATION_DIRECTORY = "plots/timeseriesTransactionData"

# data properties
DATA_DIRECTORY = "data/"
DATA_LOAD_FILE = "load.txt"
DATA_TRANSACTION_FILE = "transaction.txt"
swagger/swagger.yaml

@@ -2011,6 +2011,8 @@ definitions:
        type: number
        format: int32
##################### Templates ##################################