1 #ifndef COLIBRIDATATYPES_H
2 #define COLIBRIDATATYPES_H
45 in->read( (
char*) &sentence,
sizeof(uint32_t));
46 in->read( (
char*) &token,
sizeof(uint16_t));
52 void write(std::ostream * out)
const {
53 out->write( (
char*) &sentence,
sizeof(uint32_t));
54 out->write( (
char*) &token,
sizeof(uint16_t));
59 }
else if (sentence == other.
sentence) {
60 return (token < other.
token);
73 return std::to_string((
unsigned int) sentence) +
":" + std::to_string((
unsigned int) token);
88 std::vector<IndexReference>
data;
91 void write(std::ostream * out)
const;
95 return std::binary_search(this->
begin(), this->
end(), ref);
97 return std::find(this->
begin(), this->
end(), ref) != this->
end();
104 unsigned int count()
const {
return data.size(); }
107 size_t size()
const {
return data.size(); }
109 typedef std::vector<IndexReference>::iterator
iterator;
112 iterator
begin() {
return data.begin(); }
113 const_iterator
begin()
const {
return data.begin(); }
115 iterator
end() {
return data.end(); }
116 const_iterator
end()
const {
return data.end(); }
126 for (const_iterator iter = this->
begin(); iter != this->
end(); iter++) {
136 std::set<IndexReference>
set()
const {
137 return std::set<IndexReference>(this->
begin(), this->
end() );
144 std::sort(this->
begin(), this->
end());
151 data.shrink_to_fit();
167 template<
class ValueType>
170 virtual std::string
id() {
return "AbstractValueHandler"; }
171 virtual void read(std::istream * in, ValueType & value)=0;
172 virtual void write(std::ostream * out, ValueType & value)=0;
173 virtual std::string
tostring(ValueType & value)=0;
174 virtual unsigned int count(ValueType & value)
const =0;
177 virtual void convertto(ValueType * source, ValueType* & target )
const { target = source; };
184 template<
class ValueType>
187 virtual std::string
id() {
return "BaseValueHandler"; }
189 void read(std::istream * in, ValueType & v) {
190 in->read( (
char*) &v,
sizeof(ValueType));
192 void write(std::ostream * out, ValueType & value) {
193 out->write( (
char*) &value,
sizeof(ValueType));
198 unsigned int count(ValueType & value)
const {
199 return (
unsigned int) value;
205 void convertto(ValueType * source, ValueType* & target )
const { target = source; };
221 virtual std::string
id() {
return "IndexedDataHandler"; }
224 in->read((
char*) &c,
sizeof(uint32_t));
226 for (
unsigned int i = 0; i < c; i++) {
233 const uint32_t c = value.
count();
234 out->write((
char*) &c,
sizeof(uint32_t));
243 if (!s.empty()) s +=
" ";
244 s += iter->tostring();
249 return value.
data.size();
253 std::cerr <<
"ValueHandler: Value is NULL!" << std::endl;
259 void convertto(
IndexedData * value,
unsigned int * & convertedvalue)
const { convertedvalue =
new unsigned int; *convertedvalue = value->
count(); };
269 template<
class FeatureType>
297 in->read((
char*) &c,
sizeof(uint16_t));
299 for (
unsigned int i = 0; i < c; i++) {
301 in->read((
char*) &f,
sizeof(FeatureType));
304 data.shrink_to_fit();
307 this->pattern.
write(out);
308 unsigned int s = data.size();
310 std::cerr <<
"ERROR: PatternFeatureVector size exceeds maximum 16-bit capacity!! Not writing arbitrary parts!!! Set thresholds to prevent this!" << std::endl;
313 uint16_t c = (uint16_t) s;
314 out->write((
char*) &c ,
sizeof(uint16_t));
315 for (
unsigned int i = 0; i < s; i++) {
316 FeatureType f = data[i];
317 out->write((
char*) &f,
sizeof(FeatureType));
321 typedef typename std::vector<FeatureType>::iterator
iterator;
324 size_t size()
const {
return data.size(); }
332 FeatureType
get(
int index) {
346 data.shrink_to_fit();
351 template<
class FeatureType>
356 std::vector<PatternFeatureVector<FeatureType> *>
data;
364 for (const_iterator iter = ref.begin(); iter != ref.end(); iter++) {
368 this->data.push_back(pfv);
386 for (const_iterator iter = this->
begin(); iter != this->
end(); iter++) {
397 for (iterator iter = this->
begin(); iter != this->
end(); iter++) {
406 unsigned int count()
const {
return data.size(); }
414 if (found != this->
end()) {
419 this->data.push_back(pfv);
422 this->data.push_back(pfv);
431 if (found != this->
end()) {
436 this->data.push_back(pfv);
439 this->data.push_back(pfv);
443 size_t size()
const {
return data.size(); }
447 std::cerr <<
"ERROR: PatternFeatureVector does not support serialisation to string" << std::endl;
451 iterator
begin() {
return data.begin(); }
452 const_iterator
begin()
const {
return data.begin(); }
454 iterator
end() {
return data.end(); }
455 const_iterator
end()
const {
return data.end(); }
458 iterator iter = this->
find(pattern);
459 if (iter != this->
end()) {
470 data.shrink_to_fit();
476 template<
class FeatureType>
479 virtual std::string
id() {
return "PatternFeatureVectorMapHandler"; }
482 in->read((
char*) &c,
sizeof(uint16_t));
484 for (
unsigned int i = 0; i < c; i++) {
492 unsigned int s = value.
size();
494 std::cerr <<
"ERROR: PatternFeatureVector size exceeds maximum 16-bit capacity!! Not writing arbitrary parts!!! Set thresholds to prevent this!" << std::endl;
497 const uint16_t c = (uint16_t) s;
498 out->write((
char*) &c,
sizeof(uint16_t));
508 std::cerr <<
"ERROR: PatternFeatureVectorMapHandler does not support serialisation to string (no classdecoder at this point)" << std::endl;
515 std::cerr <<
"ERROR: PatternFeatureVectorMapHandler does not support insertion of index references, model can not be computed with train()" << std::endl;
bool operator!=(const IndexReference &other) const
Definition: datatypes.h:69
PatternFeatureVector(const Pattern &ref, const std::vector< FeatureType > &dataref)
Definition: datatypes.h:279
virtual std::string id()
Definition: datatypes.h:170
void write(std::ostream *out, ValueType &value)
Definition: datatypes.h:192
virtual unsigned int count(ValueType &value) const =0
const_iterator begin() const
Definition: datatypes.h:113
PatternFeatureVector(const PatternFeatureVector &ref)
Definition: datatypes.h:285
void write(std::ostream *out) const
IndexReference(const IndexReference &other)
Definition: datatypes.h:48
void convertto(ValueType *source, IndexedData *&target) const
Definition: datatypes.h:207
const_iterator end() const
Definition: datatypes.h:116
Definition: datatypes.h:477
static const bool indexed
Definition: datatypes.h:188
unsigned int count() const
Definition: datatypes.h:406
This templated class can be used for all numeric base types (such as int, uint16_t, float, etc).
Definition: datatypes.h:185
virtual std::string id()
Definition: datatypes.h:221
iterator find(const IndexReference &ref)
Definition: datatypes.h:118
PatternFeatureVector< FeatureType >::iterator begin()
Definition: datatypes.h:326
void push_back(FeatureType &f)
Definition: datatypes.h:339
PatternFeatureVector(const Pattern &ref)
Definition: datatypes.h:277
void reserve(size_t size)
Definition: datatypes.h:466
PatternFeatureVector< FeatureType >::const_iterator begin() const
Definition: datatypes.h:327
std::vector< IndexReference >::iterator iterator
Definition: datatypes.h:109
Classes for data types and handlers for those data types.
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
friend std::ostream & operator<<(std::ostream &out, const IndexReference &iref)
Definition: datatypes.h:76
bool operator>(const IndexReference &other) const
Definition: datatypes.h:65
void reserve(size_t size)
Definition: datatypes.h:342
std::vector< PatternFeatureVector< FeatureType > * >::iterator iterator
Definition: datatypes.h:359
std::vector< PatternFeatureVector< FeatureType > * >::const_iterator const_iterator
Definition: datatypes.h:358
Abstract value handler class, all value handlers are derived from this. Value handlers are interfaces...
Definition: datatypes.h:168
virtual std::string id()
Definition: datatypes.h:479
void add(PatternFeatureVectorMap< FeatureType > *value, const IndexReference &ref) const
Definition: datatypes.h:514
const_iterator end() const
Definition: datatypes.h:455
bool has(const IndexReference &ref, bool sorted=false) const
Definition: datatypes.h:93
void reserve(size_t size)
Definition: datatypes.h:147
virtual void read(std::istream *in, ValueType &value)=0
void write(std::ostream *out, IndexedData &value)
Definition: datatypes.h:232
void read(std::istream *in, ValueType &v)
Definition: datatypes.h:189
IndexReference(std::istream *in)
Definition: datatypes.h:44
void write(std::ostream *out, PatternFeatureVectorMap< FeatureType > &value)
Definition: datatypes.h:491
PatternFeatureVector()
Definition: datatypes.h:275
IndexReference()
Definition: datatypes.h:37
virtual void convertto(ValueType *source, ValueType *&target) const
Definition: datatypes.h:177
const_iterator find(const IndexReference &ref) const
Definition: datatypes.h:119
std::vector< PatternFeatureVector< FeatureType > * > data
Definition: datatypes.h:356
Reference to a position in the corpus.
Definition: datatypes.h:33
iterator find(const Pattern &ref)
Definition: datatypes.h:396
unsigned int count(IndexedData &value) const
Definition: datatypes.h:248
void sort()
Definition: datatypes.h:143
IndexedData()
Definition: datatypes.h:89
virtual std::string tostring(PatternFeatureVectorMap< FeatureType > &value)
Definition: datatypes.h:507
const_iterator begin() const
Definition: datatypes.h:452
unsigned int count(PatternFeatureVectorMap< FeatureType > &value) const
Definition: datatypes.h:511
unsigned int count() const
Definition: datatypes.h:104
void convertto(PatternFeatureVectorMap< FeatureType > *value, unsigned int *&convertedvalue) const
Definition: datatypes.h:520
iterator end()
Definition: datatypes.h:115
virtual std::string tostring(ValueType &value)
Definition: datatypes.h:195
Pattern pattern
Definition: datatypes.h:272
virtual ~PatternFeatureVector()
Definition: datatypes.h:276
void write(std::ostream *out) const
Definition: datatypes.h:52
void convertto(IndexedData *source, IndexedData *&target) const
Definition: datatypes.h:258
virtual void write(std::ostream *out, ValueType &value)=0
void read(std::istream *in, IndexedData &v)
Definition: datatypes.h:222
void insert(PatternFeatureVector< FeatureType > *pfv, bool checkexists=true)
Definition: datatypes.h:410
std::vector< FeatureType >::iterator iterator
Definition: datatypes.h:321
virtual std::string id()
Definition: datatypes.h:187
size_t size() const
Definition: datatypes.h:107
void shrink_to_fit()
Definition: datatypes.h:469
void convertto(ValueType *source, ValueType *&target) const
Definition: datatypes.h:205
std::set< int > sentences() const
Definition: datatypes.h:124
void read(std::istream *in, PatternFeatureVectorMap< FeatureType > &v)
Definition: datatypes.h:480
uint16_t token
Definition: datatypes.h:36
std::vector< FeatureType > data
Definition: datatypes.h:273
static const bool indexed
Definition: datatypes.h:220
void read(std::istream *in)
Definition: datatypes.h:294
void insert(IndexReference ref)
Definition: datatypes.h:106
void add(IndexedData *value, const IndexReference &ref) const
Definition: datatypes.h:251
void add(ValueType *value, const IndexReference &ref) const
Definition: datatypes.h:201
Collection of references to position in the corpus (IndexReference). Used by Indexed Pattern models...
Definition: datatypes.h:86
Data handler for IndexedData. Deals with serialisation from/to file and conversions.
Definition: datatypes.h:218
void insert(PatternFeatureVector< FeatureType > &value, bool checkexists=true)
Definition: datatypes.h:426
size_t size() const
Definition: datatypes.h:443
std::vector< FeatureType >::const_iterator const_iterator
Definition: datatypes.h:322
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:228
PatternFeatureVector(std::istream *in)
Definition: datatypes.h:289
iterator end()
Definition: datatypes.h:454
size_t size() const
Definition: datatypes.h:324
std::vector< IndexReference >::const_iterator const_iterator
Definition: datatypes.h:110
std::set< IndexReference > set() const
Definition: datatypes.h:136
void convertto(PatternFeatureVectorMap< FeatureType > *source, IndexedData *&target) const
Definition: datatypes.h:519
bool has(const Pattern &ref) const
Definition: datatypes.h:385
Basic largely trivial functions for the common good.
std::vector< IndexReference > data
Definition: datatypes.h:88
virtual std::string tostring(ValueType &value)=0
IndexReference(uint32_t sentence, uint16_t token)
Definition: datatypes.h:42
void convertto(IndexedData *value, unsigned int *&convertedvalue) const
Definition: datatypes.h:259
PatternFeatureVector< FeatureType >::iterator end()
Definition: datatypes.h:329
bool operator==(const IndexReference &other) const
Definition: datatypes.h:68
void clear()
Definition: datatypes.h:336
Definition: datatypes.h:270
void shrink_to_fit()
Definition: datatypes.h:345
iterator begin()
Definition: datatypes.h:451
virtual std::string tostring()
Definition: datatypes.h:445
unsigned int count(ValueType &value) const
Definition: datatypes.h:198
Contains the Pattern class that is ubiquitous throughout Colibri Core.
Class for decoding binary class-encoded data back to plain-text.
iterator begin()
Definition: datatypes.h:112
IndexReference operator+(const int other) const
Definition: datatypes.h:70
void shrink_to_fit()
Definition: datatypes.h:150
uint32_t sentence
Definition: datatypes.h:35
Definition: datatypes.h:352
bool operator<(const IndexReference &other) const
Definition: datatypes.h:56
void convertto(PatternFeatureVectorMap< FeatureType > *source, PatternFeatureVectorMap< FeatureType > *&target) const
Definition: datatypes.h:518
PatternFeatureVector< FeatureType >::const_iterator end() const
Definition: datatypes.h:330
virtual PatternFeatureVector< FeatureType > * getdata(const Pattern &pattern)
Definition: datatypes.h:457
virtual void add(ValueType *value, const IndexReference &ref) const =0
std::string tostring() const
Definition: datatypes.h:72
void write(std::ostream *out)
Definition: datatypes.h:306
virtual std::string tostring(IndexedData &value)
Definition: datatypes.h:240