Project: pfabric
Commit edcbb3d9, authored Feb 08, 2018 by Constantin Pohl
Added TensorFlow classifier training & data
Parent: 35a2e01d
Changes: 6 files
src/DEBS2018/data/ports.csv
Previous version of the file (no RADIUS column):
ALEXANDRIA,29.87560,31.18424
AUGUSTA,15.21417,37.19920
BARCELONA,2.15844,41.35238
CARTAGENA,-0.96793,37.58017
CEUTA,-5.31317,35.89361
DAMIETTA,31.76100,31.46800
DILISKELESI,29.53376,40.76661
FOS SUR MER,4.86513,43.42291
GEMLIK,29.11552,40.42740
GENOVA,8.90911,44.40355
GIBRALTAR,-5.36475,36.14380
HAIFA,35.00391,32.83316
ISKENDERUN,36.17972,36.68401
LIVORNO,10.30616,43.56281
MARSAXLOKK,14.54345,35.82770
MONACO,7.42689,43.73548
NEMRUT,26.90016,38.76553
PALMA DE MALLORCA,2.63237,39.55724
PIRAEUS,23.61056,37.94606
PORT SAID,32.32300,31.24478
TARRAGONA,1.22472,41.10103
TUZLA,29.29471,40.83438
VALENCIA,-0.31647,39.44231
VALLETTA,14.51505,35.89301
YALOVA,29.47632,40.71889
New version of the file (RADIUS column added):
PORT_NAME,LON,LAT,RADIUS
ALEXANDRIA,29.87560,31.18424,13
AUGUSTA,15.21417,37.19920,12
BARCELONA,2.15844,41.35238,15
CARTAGENA,-0.96793,37.58017,12
CEUTA,-5.31317,35.89361,10
DAMIETTA,31.76100,31.46800,12
DILISKELESI,29.53376,40.76661,3.5
FOS SUR MER,4.86513,43.42291,12
GEMLIK,29.11552,40.42740,15
GENOVA,8.90911,44.40355,12
GIBRALTAR,-5.36475,36.14380,10.5
HAIFA,35.00391,32.83316,13.5
ISKENDERUN,36.17972,36.68401,12.5
LIVORNO,10.30616,43.56281,15
MARSAXLOKK,14.54345,35.82770,3.5
MONACO,7.42689,43.73548,12
NEMRUT,26.90016,38.76553,13
PALMA DE MALLORCA,2.63237,39.55724,12
PIRAEUS,23.61056,37.94606,12
PORT SAID,32.32300,31.24478,15.5
TARRAGONA,1.22472,41.10103,12
TUZLA,29.29471,40.83438,13.5
VALENCIA,-0.31647,39.44231,12
VALLETTA,14.51505,35.89301,3.5
YALOVA,29.47632,40.71889,3.5
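The RADIUS column presumably defines a circular zone around each port's LON/LAT position, e.g. to decide when a ship counts as having arrived. The commit itself does not state the unit or the intended use, so the following containment check is only a minimal sketch under the assumption that RADIUS is given in kilometers; the helper names are hypothetical and not part of the commit.

# Hypothetical helper (not part of the commit): find the port, if any, whose
# circle from ports.csv contains a given position. Assumes RADIUS is in km.
import csv
import math

def haversine_km(lon1, lat1, lon2, lat2):
    # great-circle distance between two lon/lat points in kilometers
    lon1, lat1, lon2, lat2 = map(math.radians, (lon1, lat1, lon2, lat2))
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * math.asin(math.sqrt(a))

def load_ports(path="ports.csv"):
    # expects the new layout: PORT_NAME,LON,LAT,RADIUS
    with open(path) as f:
        return [(r["PORT_NAME"], float(r["LON"]), float(r["LAT"]), float(r["RADIUS"]))
                for r in csv.DictReader(f)]

def port_of(lon, lat, ports):
    # returns the first port whose circle contains the position, else None
    for name, p_lon, p_lat, radius in ports:
        if haversine_km(lon, lat, p_lon, p_lat) <= radius:
            return name
    return None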
src/DEBS2018/training/train_data/labeled_data.csv
0 → 100644
This diff is collapsed (contents not shown).
src/DEBS2018/training/train_data/splitFile.cpp
0 → 100755
#include <iostream>
#include <fstream>
#include <boost/algorithm/string/replace.hpp>

using namespace std;

/*
 * This file is for preprocessing the training data (as well as tests).
 * It does two things:
 *  - replacing port names with corresponding integer values (necessary for tensorflow)
 *  - splitting available data into training and testing (400.000 tuples for training, ~150.000 testing)
 *
 * Simply build with $ g++ splitFile.cpp -o splitFile
 * Then run $ ./splitFile
 */
int main() {
  string line;
  int cnt = 0;
  ifstream inputfile("labeled_data.csv");
  ofstream train_outputfile("training_data.csv");
  ofstream test_outputfile("testing_data.csv");

  if (inputfile.is_open() && train_outputfile.is_open() && test_outputfile.is_open()) {
    while (getline(inputfile, line)) {
      boost::replace_all(line, "ALEXANDRIA", "0");
      boost::replace_all(line, "AUGUSTA", "1");
      boost::replace_all(line, "BARCELONA", "2");
      boost::replace_all(line, "CARTAGENA", "3");
      boost::replace_all(line, "CEUTA", "4");
      boost::replace_all(line, "DAMIETTA", "5");
      boost::replace_all(line, "DILISKELESI", "6");
      boost::replace_all(line, "FOS SUR MER", "7");
      boost::replace_all(line, "GEMLIK", "8");
      boost::replace_all(line, "GENOVA", "9");
      boost::replace_all(line, "GIBRALTAR", "10");
      boost::replace_all(line, "HAIFA", "11");
      boost::replace_all(line, "ISKENDERUN", "12");
      boost::replace_all(line, "LIVORNO", "13");
      boost::replace_all(line, "MARSAXLOKK", "14");
      boost::replace_all(line, "MONACO", "15");
      boost::replace_all(line, "NEMRUT", "16");
      boost::replace_all(line, "PALMA DE MALLORCA", "17");
      boost::replace_all(line, "PIRAEUS", "18");
      boost::replace_all(line, "PORT SAID", "19");
      boost::replace_all(line, "TARRAGONA", "20");
      boost::replace_all(line, "TUZLA", "21");
      boost::replace_all(line, "VALENCIA", "22");
      boost::replace_all(line, "VALLETTA", "23");
      boost::replace_all(line, "YALOVA", "24");

      if (cnt < 400000) {
        train_outputfile << line << "\n";
      } else {
        test_outputfile << line << "\n";
      }
      cnt++;
    }
    inputfile.close();
    train_outputfile.close();
    test_outputfile.close();
  }
}
\ No newline at end of file
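As a side note, a rough Python equivalent of this preprocessing step could look like the sketch below. It is hypothetical and not part of the commit; it reuses the same alphabetical port-to-integer mapping that also appears, commented out, in train_tf.py further down, and the same split after 400,000 tuples.

# Hypothetical Python equivalent of splitFile.cpp, for illustration only.
PORT_TO_INDEX = {
    'ALEXANDRIA': '0', 'AUGUSTA': '1', 'BARCELONA': '2', 'CARTAGENA': '3',
    'CEUTA': '4', 'DAMIETTA': '5', 'DILISKELESI': '6', 'FOS SUR MER': '7',
    'GEMLIK': '8', 'GENOVA': '9', 'GIBRALTAR': '10', 'HAIFA': '11',
    'ISKENDERUN': '12', 'LIVORNO': '13', 'MARSAXLOKK': '14', 'MONACO': '15',
    'NEMRUT': '16', 'PALMA DE MALLORCA': '17', 'PIRAEUS': '18', 'PORT SAID': '19',
    'TARRAGONA': '20', 'TUZLA': '21', 'VALENCIA': '22', 'VALLETTA': '23',
    'YALOVA': '24',
}

with open("labeled_data.csv") as inp, \
     open("training_data.csv", "w") as train, \
     open("testing_data.csv", "w") as test:
    for cnt, line in enumerate(inp):
        # replace every port name with its integer label, as the C++ code does
        for name, index in PORT_TO_INDEX.items():
            line = line.replace(name, index)
        # first 400,000 lines go to the training file, the rest to the test file
        (train if cnt < 400000 else test).write(line)

Like the C++ version, this replaces port names anywhere in a line, so it relies on the port names not occurring inside other fields.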
src/DEBS2018/training/train_data/testing_data.csv
0 → 100644
This diff is collapsed (contents not shown).
src/DEBS2018/training/train_data/training_data.csv
0 → 100644
This diff is collapsed (contents not shown).
src/DEBS2018/training/train_tf.py
0 → 100644
##
#
# Hint: Port names (e.g. ALEXANDRIA) are replaced by an integer value (e.g. "0" for ALEXANDRIA).
# This includes the DEPARTURE_PORT_NAME and ARRIVAL_PORT_CALC attributes and was necessary because
# Tensorflow labels should be integers, not strings.
#
# TODOs:
# - Better features; currently (for simplicity) only longitude, latitude and course are used.
# - Parametrization: batch size, number of epochs, splitting algorithm of train/test data, ...
# - Maybe a different model; instead of a linear classifier, use neural networks or others.
#
# To run: Install Tensorflow, then just run $ python train_tf.py
#
##
import tensorflow as tf
import os
import csv
import sys

#suppress warnings regarding AVX
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#attribute names
CSV_HEADER = ['SHIP_ID', 'SHIPTYPE', 'SPEED', 'LON', 'LAT', 'COURSE', 'HEADING', 'TIMESTAMP',
              'DEPARTURE_PORT_NAME', 'REPORTED_DRAUGHT', 'ARRIVAL_CALC', 'ARRIVAL_PORT_CALC']

#default initialization: [''] is string, [0] is int, [0.0] is float
CSV_DATA_DEFAULTS = [[''], [0], [0.0], [0.0], [0.0], [0], [0], [''], [0], [''], [''], [0]]

#map port names to indexes (not necessary anymore, already done in preprocessing step (splitFile.cpp))
#CSV_WORD_TO_INDEX = {'ALEXANDRIA':0, 'AUGUSTA':1, 'BARCELONA':2, 'CARTAGENA':3, 'CEUTA':4, 'DAMIETTA':5,
#                     'DILISKELESI':6, 'FOS SUR MER':7, 'GEMLIK':8, 'GENOVA':9, 'GIBRALTAR':10,
#                     'HAIFA':11, 'ISKENDERUN':12, 'LIVORNO':13, 'MARSAXLOKK':14, 'MONACO':15,
#                     'NEMRUT':16, 'PALMA DE MALLORCA':17, 'PIRAEUS':18, 'PORT SAID':19, 'TARRAGONA':20,
#                     'TUZLA':21, 'VALENCIA':22, 'VALLETTA':23, 'YALOVA':24}

#construct features in tensorflow-format
def build_model_columns():
    #numeric column because of float value in certain range
    #COURSE is the key, necessary to map attribute to feature column
    course = tf.feature_column.numeric_column('COURSE')
    #see above
    lon = tf.feature_column.numeric_column('LON')
    lat = tf.feature_column.numeric_column('LAT')
    #combine to single variable
    base_columns = [course, lon, lat]
    return base_columns

#construct estimator for the model (with directory specification)
def build_estimator(model_dir):
    #get features
    column_model = build_model_columns()
    #optional: use CPU instead of GPU
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0}))
    #return a linear classifier model
    return tf.estimator.LinearClassifier(model_dir=model_dir, n_classes=25,
                                         feature_columns=column_model, config=run_config)

#get the data; data_file = path to file, num_epochs = passes over the data, batch_size = tuples per batch
def input_fn(data_file, num_epochs, batch_size):
    #check if file exists
    assert tf.gfile.Exists(data_file), ('%s not found.' % data_file)

    #parse string into tensors, value = line in file
    def parse_csv(value):
        #print directory path
        print("Parsing file %s" % data_file)
        #convert data into tensors
        columns = tf.decode_csv(value, record_defaults=CSV_DATA_DEFAULTS)
        #create a dictionary, mapping attribute names (e.g. course) to tensors
        features = dict(zip(CSV_HEADER, columns))
        #use arrival (port) as label
        label = features.pop('ARRIVAL_PORT_CALC')
        return features, label

    #reading raw file
    dataset = tf.data.TextLineDataset(data_file)
    #map and parse input strings into features
    dataset = dataset.map(parse_csv, num_parallel_calls=5)
    #learning finetuning
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    #iterator provides access to features & labels one at a time
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels

#main
def main(unused_argv):
    #parametrization
    num_epochs = 5
    batch_size = 10
    model_dir = "./model/"
    train_data = "./train_data/training_data.csv"
    test_data = "./train_data/testing_data.csv"

    #first, build an estimator (currently a linear classifier)
    model = build_estimator(model_dir)
    #train the classifier with the training data
    model.train(input_fn=lambda: input_fn(train_data, num_epochs, batch_size))
    #evaluate the classifier with the test data
    results = model.evaluate(input_fn=lambda: input_fn(test_data, 1, batch_size))
    #show accuracy and other information
    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

if __name__ == '__main__':
    tf.app.run(main=main, argv=[sys.argv[0]])
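One of the TODOs above mentions trying a neural network instead of the linear classifier. Under the same TF 1.x Estimator API used in this commit, that swap could be sketched roughly as follows; the function name and the hidden_units sizes are placeholders chosen for illustration, not values from the commit.

# Hypothetical variant of build_estimator() using a feed-forward network,
# sketched against the same TF 1.x Estimator API; not part of this commit.
def build_dnn_estimator(model_dir):
    column_model = build_model_columns()
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0}))
    return tf.estimator.DNNClassifier(
        model_dir=model_dir,
        n_classes=25,
        hidden_units=[64, 32],   # placeholder layer sizes
        feature_columns=column_model,
        config=run_config)

The rest of the script (input_fn, train, evaluate) would stay unchanged, since both estimators consume the same feature columns and integer labels.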