Commit 2d1c3eb3 authored by Alexander Baumstark's avatar Alexander Baumstark
Browse files

Initial commit

parents
# Build script for the Arrow-backed column store: the cs_lib library, the demo
# executable, the benchmark driver and the Catch2-based unit tests.
cmake_minimum_required(VERSION 3.16)
project(arrow LANGUAGES CXX)

message(STATUS "Arrow root = ${ARROW_ROOT}")

# Every target below is built as C++17; fail at configure time if the compiler
# cannot provide it instead of silently decaying to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(ARROW_LIBS libarrow.so)

# ARROW_INCLUDE_DIRS / ARROW_LINK_DIRS can be passed on the command line when
# Arrow is installed outside the default search paths. The historical
# misspelling ARROW_INCLIDE_DIRS is still honoured so existing invocations keep
# working.
include_directories(${ARROW_INCLUDE_DIRS} ${ARROW_INCLIDE_DIRS})
link_directories(${ARROW_LINK_DIRS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/lib)

# Provides the Threads::Threads imported target used by the executables.
find_package(Threads REQUIRED)

add_library(cs_lib
  lib/core/Core.cpp
  lib/core/Core.h
  lib/column/GenericColumnCursor.cpp
  lib/column/GenericColumnCursor.h
  lib/column/BaseColumnCursor.cpp
  lib/column/BaseColumnCursor.h
  lib/column/ChunkedColumnCursor.cpp
  lib/column/ChunkedColumnCursor.h
  lib/column/ColumnBuilder.cpp
  lib/column/ColumnBuilder.h
  lib/table/TableCursor.cpp
  lib/table/TableCursor.h
  lib/table/Table.cpp
  lib/table/Table.h
  lib/database/Database.cpp
  lib/index/Comparer.h
  lib/index/Index.h
)
target_include_directories(cs_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# Demo executable.
add_executable(arrow main.cpp)
target_link_libraries(arrow PRIVATE cs_lib ${ARROW_LIBS} Threads::Threads)

# Benchmark driver (does not link cs_lib).
add_executable(bench bench/db_bench.cpp)
target_link_libraries(bench PRIVATE ${ARROW_LIBS} Threads::Threads)

# Unit tests; Catch2 is fetched at configure time.
include(FetchContent)
FetchContent_Declare(
  Catch2
  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
  GIT_TAG v2.13.1
)
FetchContent_MakeAvailable(Catch2)

add_executable(tests test/database_test.cpp test/index_test.cpp)
target_link_libraries(tests PRIVATE cs_lib ${ARROW_LIBS} Threads::Threads Catch2::Catch2)
# Build environment for the Arrow column store: Ubuntu with a C++ toolchain and
# the Apache Arrow development package from the Apache Arrow APT repository.
FROM ubuntu:latest

# Keep package installation from blocking on interactive configuration prompts.
ARG DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install in the same layer, so a cached (stale)
# index is never combined with a later install step.
RUN apt-get update && \
    apt-get install -y -V build-essential gcc ca-certificates lsb-release wget git

# Register the Arrow APT source matching this distribution/release, refresh the
# index so the new source is visible, then install the Arrow headers and library.
RUN wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \
    apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \
    apt-get update && \
    apt-get install -y -V libarrow-dev

COPY . /usr/src/arrow_db
WORKDIR /usr/src/arrow_db
#include <iostream>
/**
 * Benchmark entry point. Currently it only prints a greeting and exits.
 */
int main() {
    // '\n' followed by an explicit flush is what std::endl does.
    std::cout << "Hello bench" << '\n' << std::flush;
    return 0;
}
#include "BaseColumnCursor.h"
#include "ChunkedColumnCursor.h"
namespace db {

/**
 * Construct the typed cursor; the table cursor is handed on to the
 * GenericColumnCursor base.
 */
template <typename T>
BaseColumnCursor<T>::BaseColumnCursor(TableCursor &table_cursor)
    : GenericColumnCursor(table_cursor) {}

/**
 * Factory for a cursor over the given column.
 *
 * @param column       the chunked Arrow column to iterate
 * @param encoding     encoding of the column's values
 * @param table_cursor the table cursor passed to the created cursor
 * @return a ChunkedColumnCursor for every encoding
 */
template <typename T>
std::shared_ptr<BaseColumnCursor<T>> BaseColumnCursor<T>::makeCursor(
    std::shared_ptr<arrow::ChunkedArray> column,
    ColumnEncoding encoding,
    TableCursor &table_cursor) {
    switch (encoding) {
        case db::ColumnEncoding::DICT:
            // No dictionary cursor is available here, so DICT deliberately
            // shares the plain implementation.
        case db::ColumnEncoding::PLAIN:
        default:
            return std::make_shared<ChunkedColumnCursor<T>>(column, table_cursor);
    }
}

}  // namespace db
// Explicit instantiations: the member definitions above live in this source
// file rather than in the header, so each supported type is instantiated here.
template class db::BaseColumnCursor<db::IntType>;
template class db::BaseColumnCursor<db::StringType>;
\ No newline at end of file
#ifndef ARROW_BASECOLUMNCURSOR_H
#define ARROW_BASECOLUMNCURSOR_H
#include <memory>
#include <arrow/api.h>
#include "../core/Core.h"
#include "GenericColumnCursor.h"
namespace db {

class TableCursor;

/**
 * Typed column cursor interface layered on top of GenericColumnCursor.
 *
 * @tparam T type descriptor whose nested ElementType is the value type
 *           returned by get().
 */
template <typename T>
class BaseColumnCursor : public GenericColumnCursor {
public:
    explicit BaseColumnCursor(TableCursor &table_cursor);

    /**
     * Create a cursor over a column.
     * @param column       the chunked Arrow column to read
     * @param encoding     encoding of the column's values
     * @param table_cursor the table cursor handed to the new cursor
     * @return the newly created cursor
     */
    static std::shared_ptr<BaseColumnCursor<T>> makeCursor(
        std::shared_ptr<arrow::ChunkedArray> column,
        ColumnEncoding encoding,
        TableCursor &table_cursor);

    /**
     * Read the value at the current position.
     * @return the element at the current position
     */
    virtual typename T::ElementType get() = 0;

protected:
    /**
     * Move the cursor to an absolute position.
     * @param to zero-based ordinal position of the element in the column
     * @return true on success
     */
    virtual bool seek(uint64_t to) = 0;

    /**
     * Report whether a call to next() would yield another element.
     * @return true if another element is available
     */
    virtual bool hasMore() = 0;

    /**
     * Step to the following element.
     * @return true if an element is available, false at the end of the column
     */
    virtual bool next() = 0;
};

}  // namespace db
#endif //ARROW_BASECOLUMNCURSOR_H
#include <iostream>
#include "ChunkedColumnCursor.h"
#include "../core/Core.h"
namespace db {

/**
 * Create a cursor over `column`, positioned at the first element (if any).
 */
template <typename T>
ChunkedColumnCursor<T>::ChunkedColumnCursor(std::shared_ptr<arrow::ChunkedArray> column,
                                            TableCursor &table_cursor)
    : BaseColumnCursor<T>(table_cursor), _column(std::move(column)) {
    reset();
}

/**
 * @return true if next() would move to another element.
 */
template <typename T>
bool ChunkedColumnCursor<T>::hasMore() {
    return (_pos + 1) < _column->length();
}

/**
 * Advance by one element, stepping over chunk boundaries and empty chunks.
 * @return true if the cursor now points at an element, false at the end.
 */
template <typename T>
bool ChunkedColumnCursor<T>::next() {
    if ((_pos + 1) >= _column->length()) {
        return false;
    }
    ++_pos;
    ++_pos_in_chunk;
    // Leaving the current chunk may require skipping any number of empty
    // chunks before the next element is reached.
    while (_pos_in_chunk >= _current_chunk->length()) {
        if (!advance_chunk()) {
            // Only reachable if the column's length disagrees with the sum of
            // its chunk lengths.
            return false;
        }
    }
    return true;
}

/**
 * Is the element at the table cursor's current position null?
 * A position outside the column is reported as null.
 */
template <typename T>
bool ChunkedColumnCursor<T>::isNull() {
    if (!seek(this->get_pos())) {
        return true;
    }
    return _current_chunk->IsNull(_pos_in_chunk);
}

/**
 * Value at the table cursor's current position.
 * A position outside the column yields a value-initialized element instead of
 * reading out of bounds.
 */
template <typename T>
typename T::ElementType ChunkedColumnCursor<T>::get() {
    if (!seek(this->get_pos())) {
        return {};
    }
    return _current_chunk->Value(_pos_in_chunk);
}

/**
 * String columns are read through GetString() rather than Value().
 * A position outside the column yields a value-initialized element.
 */
template <>
typename db::StringType::ElementType ChunkedColumnCursor<db::StringType>::get() {
    if (!seek(this->get_pos())) {
        return {};
    }
    return _current_chunk->GetString(_pos_in_chunk);
}

/**
 * Reposition at the first element of the column, if there is one.
 * A column without chunks leaves the cursor without a current chunk.
 */
template <typename T>
void ChunkedColumnCursor<T>::reset() {
    _pos = 0;
    _chunk = 0;
    _pos_in_chunk = 0;
    if (_column->num_chunks() == 0) {
        _current_chunk.reset();
        return;
    }
    _current_chunk =
        std::static_pointer_cast<typename T::ArrayType>(_column->chunk(0));
    // Skip leading empty chunks so that logical position 0 maps onto a real
    // element whenever the column has one.
    while (_current_chunk->length() == 0 && advance_chunk()) {
    }
}

/**
 * Seek to an absolute position in the column.
 *
 * Forward seeks walk the chunk sequence without touching the memory of the
 * chunks that are skipped. Backward seeks stay inside the current chunk when
 * possible and otherwise restart from the beginning of the column.
 *
 * @param to zero-based ordinal position of the element in the column
 * @return true if the cursor now points at element `to`, false if `to` lies
 *         outside the column
 */
template <typename T>
bool ChunkedColumnCursor<T>::seek(uint64_t to) {
    const int64_t target = static_cast<int64_t>(to);
    if (!_current_chunk || target < 0) {
        return false;
    }
    if (target < _pos) {
        const int64_t back = _pos - target;
        if (back <= _pos_in_chunk) {
            // The target lies in the current chunk: just step back.
            _pos -= back;
            _pos_in_chunk -= back;
            return true;
        }
        reset();
    }
    int64_t distance = target - _pos;
    while (_pos_in_chunk + distance >= _current_chunk->length()) {
        const int64_t advancing = _current_chunk->length() - _pos_in_chunk;
        if (!advance_chunk()) {
            return false;
        }
        distance -= advancing;
        _pos += advancing;
    }
    // The loop exited, so the current chunk holds the target element.
    _pos += distance;
    _pos_in_chunk += distance;
    return true;
}

/**
 * Move to the start of the next chunk.
 * @return true if successful, false if the current chunk was the last one
 *         (in which case the cursor state is left untouched).
 */
template <typename T>
bool ChunkedColumnCursor<T>::advance_chunk() {
    if ((_chunk + 1) >= _column->num_chunks()) {
        return false;
    }
    ++_chunk;
    _pos_in_chunk = 0;
    _current_chunk =
        std::static_pointer_cast<typename T::ArrayType>(_column->chunk(_chunk));
    return true;
}

}  // namespace db
// Explicit instantiations: the member definitions above live in this source
// file rather than in the header, so each supported type is instantiated here.
template class db::ChunkedColumnCursor<db::IntType>;
template class db::ChunkedColumnCursor<db::StringType>;
\ No newline at end of file
#ifndef ARROW_CHUNKEDCOLUMNCURSOR_H
#define ARROW_CHUNKEDCOLUMNCURSOR_H
#include <arrow/table.h>
#include "BaseColumnCursor.h"
namespace db {

/**
 * A simple column cursor implemented on top of a possibly chunked Arrow column
 * that hides the chunking to present a simple column structure. This is not
 * directly used for executing queries.
 *
 * @tparam T Type descriptor supplying the nested ArrayType (the Arrow array
 *           class of each chunk) and ElementType (the value type returned by
 *           get()).
 */
template <typename T>
class ChunkedColumnCursor : public BaseColumnCursor<T> {
public:
    /**
     * Create from a column -- initially positioned at first element, if any.
     * @param column       the chunked Arrow column to iterate
     * @param table_cursor the table cursor passed on to the base class
     */
    explicit ChunkedColumnCursor(std::shared_ptr<arrow::ChunkedArray> column,
                                 TableCursor &table_cursor);

    /**
     * Will next() produce another element?
     * @return true if another element is available
     */
    bool hasMore() override;

    /**
     * Move to the next element.
     * @return True if an element is available, false otherwise (end of column.)
     */
    bool next() override;

    /**
     * Is the element at the current position null?
     * @return true if the element is null
     */
    bool isNull() override;

    /**
     * Get value at current position.
     * @return the element at the current position
     */
    typename T::ElementType get() override;

    /**
     * Reset to the first element, if any.
     */
    void reset() override;

    /**
     * Seek to the given position.
     * @param to zero-based ordinal position of element in column
     * @return True if successful.
     */
    bool seek(uint64_t to) override;

protected:
    /**
     * Advance to the next chunk in the column's chunk sequence, when the values
     * in the current chunk have been exhausted.
     * @return True if successful, false if the current chunk was the last.
     */
    bool advance_chunk();

private:
    /**
     * The underlying column
     */
    std::shared_ptr<arrow::ChunkedArray> _column;

    /**
     * The current chunk of the underlying column
     */
    std::shared_ptr<typename T::ArrayType> _current_chunk;

    /**
     * Offset of the current chunk in the sequence of chunks
     */
    int32_t _chunk = 0;

    /**
     * Offset within the current chunk
     */
    int64_t _pos_in_chunk = 0;

    /**
     * Position within the (logical) column.
     */
    int64_t _pos = 0;
};

}  // namespace db
#endif //ARROW_CHUNKEDCOLUMNCURSOR_H
#include <iostream>
#include "ColumnBuilder.h"
using namespace db;
/**
 * Create a builder for one column.
 *
 * A plain Arrow builder is created for every encoding. There is no dedicated
 * dictionary builder, and BaseColumnCursor::makeCursor likewise reads DICT
 * columns with the plain cursor, so DICT columns are stored plain instead of
 * leaving the builder unset (which made the first add() dereference null).
 *
 * @param field    description of the column being built
 * @param encoding requested encoding; recorded in enc_
 * @param pool     memory pool handed to the Arrow builder
 */
template <typename T>
ColumnBuilder<T>::ColumnBuilder(std::shared_ptr<arrow::Field> field,
                                db::ColumnEncoding encoding,
                                arrow::MemoryPool *pool)
    : have_data_(false),
      enc_(encoding),
      builder_(new typename T::BuilderType(pool)),
      field_(field) {}
/**
 * Append a type-erased value to the current chunk.
 *
 * @param value the value to append; a null pointer appends a null entry
 * @return the status of the append, or a TypeError status when `value` does not
 *         hold this column's element type
 */
template <typename T>
arrow::Status ColumnBuilder<T>::add(std::shared_ptr<db::GenValue> value) {
    if (!value) {
        return addNull();
    }
    // The cast yields null when the dynamic type of the value is not
    // Value<ElementType>; report that instead of dereferencing it.
    auto typed = std::dynamic_pointer_cast<db::Value<typename T::ElementType>>(value);
    if (!typed) {
        return arrow::Status::TypeError(
            "ColumnBuilder::add: value type does not match the column's element type");
    }
    return add(typed->get());
}
/**
 * Append a null entry to the current chunk and mark the chunk as non-empty.
 * @return the status reported by the Arrow builder
 */
template <typename T>
arrow::Status ColumnBuilder<T>::addNull() {
    have_data_ = true;
    const arrow::Status outcome = builder_->AppendNull();
    return outcome;
}
/**
 * Append a concrete element to the current chunk and mark the chunk as
 * non-empty.
 * @param element the value to append
 * @return the status reported by the Arrow builder
 */
template <typename T>
arrow::Status ColumnBuilder<T>::add(typename T::ElementType element) {
    have_data_ = true;
    const arrow::Status outcome = builder_->Append(element);
    return outcome;
}
/**
 * Close the current chunk and start a new one.
 *
 * The builder is finished when values were appended since the last chunk, or
 * when no chunk exists yet (so the first call always attempts to produce a
 * chunk, even an empty one). A chunk is stored only if the Arrow builder
 * finished successfully; a failure is reported on stderr because this method
 * has no status return.
 */
template <typename T>
void ColumnBuilder<T>::endChunk() {
    if (have_data_ || chunks_.empty()) {
        std::shared_ptr<arrow::Array> array;
        const arrow::Status status = builder_->Finish(&array);
        if (status.ok() && array) {
            chunks_.push_back(array);
        } else {
            std::cerr << "ColumnBuilder::endChunk: failed to finish chunk: "
                      << status.ToString() << std::endl;
        }
    }
    have_data_ = false;
}
/**
 * Close the chunk in progress and wrap all chunks collected so far in a
 * chunked array.
 * @return the column as an arrow::ChunkedArray
 */
template <typename T>
std::shared_ptr<arrow::ChunkedArray> ColumnBuilder<T>::getColumn() {
    endChunk();
    std::shared_ptr<arrow::ChunkedArray> column =
        std::make_shared<arrow::ChunkedArray>(chunks_);
    return column;
}
// Explicit instantiations: the member definitions above live in this source
// file rather than in the header, so each supported type is instantiated here.
template class db::ColumnBuilder<db::IntType>;
template class db::ColumnBuilder<db::StringType>;
//TODO: DoubleType
\ No newline at end of file
#ifndef ARROW_COLUMNBUILDER_H
#define ARROW_COLUMNBUILDER_H
#include <arrow/api.h>
#include "../core/Core.h"
/**
 * ColumnBuilder creates columns that are based on the Arrow ChunkedArray type.
 * The type and the encoding must be specified for every column.
 */
namespace db {

/**
 * Type-erased interface of a column builder.
 */
class GenColumnBuilder {
    /** Append a null value. */
    virtual arrow::Status addNull() = 0;

public:
    /** Append a value. */
    virtual arrow::Status add(std::shared_ptr<db::GenValue> value) = 0;

    /** Finish the current chunk and start a new one. */
    virtual void endChunk() = 0;

    /** Get the resulting ChunkedArray. */
    virtual std::shared_ptr<arrow::ChunkedArray> getColumn() = 0;

    virtual ~GenColumnBuilder() = default;
};

/**
 * Generic column builder.
 *
 * @tparam T type descriptor supplying the nested ElementType and BuilderType.
 */
template <typename T>
class ColumnBuilder : public GenColumnBuilder {
public:
    /**
     * @param field    description of the column's field
     * @param encoding encoding of the column
     * @param pool     memory pool in which the data is stored
     */
    explicit ColumnBuilder(std::shared_ptr<arrow::Field> field,
                           db::ColumnEncoding encoding,
                           arrow::MemoryPool *pool = arrow::default_memory_pool());

    arrow::Status add(std::shared_ptr<db::GenValue> value) override;

    void endChunk() override;

    std::shared_ptr<arrow::ChunkedArray> getColumn() override;

protected:
    arrow::Status add(typename T::ElementType element);

    arrow::Status addNull() override;

private:
    /** True while the chunk in progress holds appended values. */
    bool have_data_;

    db::ColumnEncoding enc_;

    /** Arrow builder for the column's type. */
    std::unique_ptr<typename T::BuilderType> builder_;

    // TODO: dictionary builder for DICT-encoded columns.

    /** The chunks of the column, i.e. its data. */
    arrow::ArrayVector chunks_;

    std::shared_ptr<arrow::Field> field_;
};

}
#endif //ARROW_COLUMNBUILDER_H
#include "GenericColumnCursor.h"
#include "../table/TableCursor.h"
using namespace db;
using namespace db;
GenericColumnCursor::GenericColumnCursor(TableCursor &table_cursor)
: _table_cursor(table_cursor)
{
}
int
GenericColumnCursor::get_pos()
{
return _table_cursor.get_pos();
}
\ No newline at end of file
#ifndef ARROW_GENERICCOLUMNCURSOR_H
#define ARROW_GENERICCOLUMNCURSOR_H
namespace db {

class TableCursor;

/**
 * Untyped access to a column whose position is controlled by a TableCursor.
 * To read values, cast to the matching typed cursor (BaseColumnCursor<T>) and
 * call get().
 */
class GenericColumnCursor {
    // ScanTableCursor is granted access to the protected reset()/get_pos().
    friend class ScanTableCursor;

public:
    virtual ~GenericColumnCursor() = default;

    /**
     * Is the element at the current position null?
     * @return true if the element is null
     */
    virtual bool isNull() = 0;

protected:
    explicit GenericColumnCursor(TableCursor &table_cursor);

    /**
     * Reset to the first element, if any.
     */
    virtual void reset() = 0;

    /**
     * Position reported by the controlling table cursor.
     */
    int get_pos();

private:
    TableCursor &_table_cursor;
};

}  // namespace db
#endif //ARROW_GENERICCOLUMNCURSOR_H
#include "Core.h"
/** Create the type descriptor for string columns. */
std::shared_ptr<::db::DataType> db::string_type() {
    std::shared_ptr<::db::DataType> descriptor = std::make_shared<::db::StringType>();
    return descriptor;
}
/** Create the type descriptor for integer columns. */
std::shared_ptr<::db::DataType> db::int_type() {
    std::shared_ptr<::db::DataType> descriptor = std::make_shared<::db::IntType>();
    return descriptor;
}
//TODO: double_type()
/** Wrap a 64-bit integer in a type-erased value. */
std::shared_ptr<::db::GenValue> db::int_val(int64_t i) {
    std::shared_ptr<::db::GenValue> boxed = std::make_shared<::db::Value<int64_t>>(i);
    return boxed;
}
/** Wrap a string in a type-erased value. */
std::shared_ptr<::db::GenValue> db::str_val(std::string s) {
    std::shared_ptr<::db::GenValue> boxed = std::make_shared<::db::Value<std::string>>(s);
    return boxed;
}
//TODO: double_val
\ No newline at end of file
#ifndef ARROW_CORE_H
#define ARROW_CORE_H
#include <arrow/api.h>
#include <memory>
#include <iostream>
// Namespace for the database implementation
namespace db {
// Supported column types. TODO: DOUBLE
enum class ColumnType {
    INT,
    STRING
};
// Encoding of the values stored in a column.
enum class ColumnEncoding {
    PLAIN = 0,
    DICT
};
// Abstract base class for the data types of a column.
class DataType {
public:
    virtual ~DataType() = default;

    // Identifier of this type as a ColumnType value.
    virtual ColumnType id() = 0;

    // The corresponding Arrow data type.
    virtual std::shared_ptr<arrow::DataType> getArrowType() = 0;
};
// Integer Typ für Columns mit Ganzzahlen als Value -> 64bit !
class IntType : public DataType {
public:
// Typdefinitionen
// ArrayType eines Columns
using ArrayType = arrow::NumericArray<arrow::Int64Type>;
// entsprechende C++ Typ -> int64
using ElementType = arrow::Int64Type::c_type;
// Builder um Werte zu erzeugen
using BuilderType = arrow::Int64Builder;
// DictionaryBuilder für DICT encodingg
using DictionaryBuilderType = arrow::DictionaryBuilder<arrow::Int64Type>;
// Mapping von DataType zu ColumnType enum
const ColumnType TYPE_ID = ::db::ColumnType::INT;
// Getter für TYPE_ID