Commit edcbb3d9 authored by Constantin Pohl's avatar Constantin Pohl
Browse files

Added TensorFlow classifier training & data

parent 35a2e01d
ALEXANDRIA,29.87560,31.18424
AUGUSTA,15.21417,37.19920
BARCELONA,2.15844,41.35238
CARTAGENA,-0.96793,37.58017
CEUTA,-5.31317,35.89361
DAMIETTA,31.76100,31.46800
DILISKELESI,29.53376,40.76661
FOS SUR MER,4.86513,43.42291
GEMLIK,29.11552,40.42740
GENOVA,8.90911,44.40355
GIBRALTAR,-5.36475,36.14380
HAIFA,35.00391,32.83316
ISKENDERUN,36.17972,36.68401
LIVORNO,10.30616,43.56281
MARSAXLOKK,14.54345,35.82770
MONACO,7.42689,43.73548
NEMRUT,26.90016,38.76553
PALMA DE MALLORCA,2.63237,39.55724
PIRAEUS,23.61056,37.94606
PORT SAID,32.32300,31.24478
TARRAGONA,1.22472,41.10103
TUZLA,29.29471,40.83438
VALENCIA,-0.31647,39.44231
VALLETTA,14.51505,35.89301
YALOVA,29.47632,40.71889
PORT_NAME,LON,LAT,RADIUS
ALEXANDRIA,29.87560,31.18424,13
AUGUSTA,15.21417,37.19920,12
BARCELONA,2.15844,41.35238,15
CARTAGENA,-0.96793,37.58017,12
CEUTA,-5.31317,35.89361,10
DAMIETTA,31.76100,31.46800,12
DILISKELESI,29.53376,40.76661,3.5
FOS SUR MER,4.86513,43.42291,12
GEMLIK,29.11552,40.42740,15
GENOVA,8.90911,44.40355,12
GIBRALTAR,-5.36475,36.14380,10.5
HAIFA,35.00391,32.83316,13.5
ISKENDERUN,36.17972,36.68401,12.5
LIVORNO,10.30616,43.56281,15
MARSAXLOKK,14.54345,35.82770,3.5
MONACO,7.42689,43.73548,12
NEMRUT,26.90016,38.76553,13
PALMA DE MALLORCA,2.63237,39.55724,12
PIRAEUS,23.61056,37.94606,12
PORT SAID,32.32300,31.24478,15.5
TARRAGONA,1.22472,41.10103,12
TUZLA,29.29471,40.83438,13.5
VALENCIA,-0.31647,39.44231,12
VALLETTA,14.51505,35.89301,3.5
YALOVA,29.47632,40.71889,3.5
This diff is collapsed.
#include <iostream>
#include <fstream>
#include <boost/algorithm/string/replace.hpp>
using namespace std;
/*
* This file preprocesses the labeled data for training (as well as testing).
* It does two things:
* - replaces port names with their corresponding integer values (necessary for tensorflow)
* - splits the available data into training and testing sets (the first 400,000 tuples for training, the remaining ~150,000 for testing)
*
* Requires the Boost headers (boost/algorithm/string/replace.hpp) on the include path.
* Simply build with $g++ splitFile.cpp -o splitFile
* Then run $./splitFile
*/
/*
 * Reads labeled_data.csv line by line, replaces every occurrence of a known
 * port name with its integer label, and writes the first kTrainingTuples
 * lines to training_data.csv and every remaining line to testing_data.csv.
 *
 * Returns 0 on success, 1 if a file cannot be opened or an output file could
 * not be written completely (a message is printed to stderr in that case).
 */
int main() {
    // Port name and the integer label (as text) written in its place.
    struct PortLabel {
        std::string name;
        std::string label;
    };
    static const PortLabel kPortLabels[] = {
        {"ALEXANDRIA", "0"},
        {"AUGUSTA", "1"},
        {"BARCELONA", "2"},
        {"CARTAGENA", "3"},
        {"CEUTA", "4"},
        {"DAMIETTA", "5"},
        {"DILISKELESI", "6"},
        {"FOS SUR MER", "7"},
        {"GEMLIK", "8"},
        {"GENOVA", "9"},
        {"GIBRALTAR", "10"},
        {"HAIFA", "11"},
        {"ISKENDERUN", "12"},
        {"LIVORNO", "13"},
        {"MARSAXLOKK", "14"},
        {"MONACO", "15"},
        {"NEMRUT", "16"},
        {"PALMA DE MALLORCA", "17"},
        {"PIRAEUS", "18"},
        {"PORT SAID", "19"},
        {"TARRAGONA", "20"},
        {"TUZLA", "21"},
        {"VALENCIA", "22"},
        {"VALLETTA", "23"},
        {"YALOVA", "24"},
    };
    const std::size_t kPortCount = sizeof(kPortLabels) / sizeof(kPortLabels[0]);

    // Number of leading input lines that go into the training file.
    const long long kTrainingTuples = 400000;

    // Open the input first: constructing the ofstreams truncates the output
    // files, which must not happen when there is nothing to read.
    std::ifstream inputfile("labeled_data.csv");
    if (!inputfile.is_open()) {
        std::cerr << "splitFile: cannot open labeled_data.csv for reading\n";
        return 1;
    }

    std::ofstream train_outputfile("training_data.csv");
    std::ofstream test_outputfile("testing_data.csv");
    if (!train_outputfile.is_open() || !test_outputfile.is_open()) {
        std::cerr << "splitFile: cannot open training_data.csv and/or testing_data.csv for writing\n";
        return 1;
    }

    std::string line;
    long long cnt = 0;
    while (std::getline(inputfile, line)) {
        // Replacement is applied to the whole line, so every attribute that
        // holds a port name is converted.
        for (std::size_t i = 0; i < kPortCount; ++i) {
            boost::replace_all(line, kPortLabels[i].name, kPortLabels[i].label);
        }

        std::ofstream& target = (cnt < kTrainingTuples) ? train_outputfile : test_outputfile;
        target << line << '\n';
        ++cnt;
    }

    inputfile.close();
    train_outputfile.close();
    test_outputfile.close();

    // close() flushes; a failed write or flush leaves the stream in a failed state.
    if (train_outputfile.fail() || test_outputfile.fail()) {
        std::cerr << "splitFile: error while writing the output files\n";
        return 1;
    }
    return 0;
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
##
#
# Hint: Port names (e.g. ALEXANDRIA) are replaced by integer values (e.g. "0" for ALEXANDRIA).
# This includes the DEPARTURE_PORT_NAME and ARRIVAL_PORT_CALC attributes and was necessary because
# Tensorflow labels should be integers, not strings.
#
# TODOs:
# - Better features; currently (for simplicity) only longitude, latitude and course are used.
# - Parametrization: batch size, number of epochs, splitting algorithm of train/test data, ...
# - Maybe a different model; instead of a linear classifier, use neural networks or others.
#
# To run: Install Tensorflow, then just run $python train_tf.py
#
##
import tensorflow as tf
import os
import csv
import sys
#suppress warnings regarding AVX
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#attribute names of one csv line, in column order
#input_fn zips these names with the decoded columns to build the feature dictionary
CSV_HEADER = ['SHIP_ID','SHIPTYPE','SPEED','LON','LAT','COURSE','HEADING','TIMESTAMP','DEPARTURE_PORT_NAME',
'REPORTED_DRAUGHT','ARRIVAL_CALC','ARRIVAL_PORT_CALC']
#default initialization: [''] is string, [0] is int, [0.0] is float
#one entry per attribute in CSV_HEADER (same order); passed to tf.decode_csv as record_defaults in input_fn
CSV_DATA_DEFAULTS = [[''],[0],[0.0],[0.0],[0.0],[0],[0],[''],[0],[''],[''],[0]]
#map port names to indexes (not necessary anymore, already done in preprocessing step (splitFile.cpp))
#CSV_WORD_TO_INDEX = {'ALEXANDRIA':0, 'AUGUSTA':1, 'BARCELONA':2, 'CARTAGENA':3, 'CEUTA':4, 'DAMIETTA':5,
# 'DILISKELESI':6, 'FOS SUR MER':7, 'GEMLIK':8, 'GENOVA':9, 'GIBRALTAR':10,
# 'HAIFA':11, 'ISKENDERUN':12, 'LIVORNO':13, 'MARSAXLOKK':14, 'MONACO':15,
# 'NEMRUT':16, 'PALMA DE MALLORCA':17, 'PIRAEUS':18, 'PORT SAID':19, 'TARRAGONA':20,
# 'TUZLA':21, 'VALENCIA':22, 'VALLETTA':23, 'YALOVA':24}
#construct features in tensorflow-format
def build_model_columns():
    """Return the list of feature columns the model is built from.

    One numeric feature column is created per attribute key. The key is the
    attribute name that maps a parsed csv attribute to its feature column.

    Returns:
        A list with the numeric columns for COURSE, LON and LAT, in that order.
    """
    #attribute keys of the features, in the order they appear in the result
    feature_keys = ('COURSE', 'LON', 'LAT')
    return [tf.feature_column.numeric_column(key) for key in feature_keys]
#construct estimator for the model (with directory specification)
def build_estimator(model_dir):
    """Build the linear classifier used for training and evaluation.

    Args:
        model_dir: directory handed to the estimator as its model_dir.

    Returns:
        A tf.estimator.LinearClassifier with 25 classes (one per port label)
        that uses the feature columns from build_model_columns().
    """
    #get features
    feature_columns = build_model_columns()
    #optional: Use CPU instead of GPU
    session_config = tf.ConfigProto(device_count={'GPU': 0})
    estimator_config = tf.estimator.RunConfig().replace(session_config=session_config)
    #return a linear classifier model
    return tf.estimator.LinearClassifier(
        model_dir=model_dir,
        n_classes=25,
        feature_columns=feature_columns,
        config=estimator_config)
#get the data: data_file = path to the csv file, num_epochs = how often the dataset is repeated,
#batch_size = number of parsed lines combined into one batch
def input_fn(data_file, num_epochs, batch_size):
    """Build the input pipeline for one csv file.

    Args:
        data_file: path to the csv file to read.
        num_epochs: passed to dataset.repeat().
        batch_size: passed to dataset.batch().

    Returns:
        A (features, labels) pair taken from a one-shot iterator over the
        dataset. features is a dictionary mapping the CSV_HEADER attribute
        names (without ARRIVAL_PORT_CALC) to tensors, labels is the
        ARRIVAL_PORT_CALC column.

    Raises:
        AssertionError: if data_file does not exist.
    """
    #check if file exists
    #the message needs the '%s' placeholder; with a bare '%' formatting the message itself
    #raised a ValueError and hid the actual problem
    assert tf.gfile.Exists(data_file), ('%s not found.' % (data_file,))

    #parse string into tensors, value = line in file
    def parse_csv(value):
        #print directory path
        print("Parsing file %s" % data_file)
        #convert data into tensors
        columns = tf.decode_csv(value, record_defaults=CSV_DATA_DEFAULTS)
        #create a dictionary, mapping attribute names (e.g. course) to tensors
        features = dict(zip(CSV_HEADER, columns))
        #use arrival (port) as label
        label = features.pop('ARRIVAL_PORT_CALC')
        return features, label

    #reading raw file
    dataset = tf.data.TextLineDataset(data_file)
    #map and parse input strings into features
    dataset = dataset.map(parse_csv, num_parallel_calls=5)
    #learning finetuning
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    #iterator provides access to features & labels one at a time
    iterator = dataset.make_one_shot_iterator()
    features, labels = iterator.get_next()
    return features, labels
#main
def main(unused_argv):
    """Train the classifier on the training data, evaluate it on the test data and print the results.

    Args:
        unused_argv: command line arguments; not used.
    """
    #parametrization
    num_epochs, batch_size = 5, 10
    model_dir = "./model/"
    train_data = "./train_data/training_data.csv"
    test_data = "./train_data/testing_data.csv"

    #input functions without arguments, as handed to the estimator
    def train_input():
        return input_fn(train_data, num_epochs, batch_size)

    def test_input():
        #the test data is read exactly once
        return input_fn(test_data, 1, batch_size)

    #first, build an estimator (currently a linear classifier)
    classifier = build_estimator(model_dir)
    #train the classifier with training data
    classifier.train(input_fn=train_input)
    #evaluate classifier with test data
    results = classifier.evaluate(input_fn=test_input)
    #show accuracy and other information
    for key, value in sorted(results.items()):
        print('%s: %s' % (key, value))
#script entry point: hands main to tf.app.run; only the program name (sys.argv[0]) is forwarded as argv
if __name__ == '__main__':
tf.app.run(main=main, argv=[sys.argv[0]])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment