1 #ifndef COLIBRIPATTERNSTORE_H
2 #define COLIBRIPATTERNSTORE_H
8 #include <unordered_map>
13 #include <unordered_set>
56 patternpointer = NULL;
81 if (corpus != NULL)
delete[]
corpus;
89 void load(std::istream *in,
bool debug =
false);
95 void load(std::string filename,
bool debug =
false);
136 std::vector<IndexReference>
findpattern(
const Pattern & pattern, uint32_t sentence = 0,
int maxmatches=0);
150 unsigned int sentences()
const {
return sentenceindex.size(); }
168 pairpointer =
new std::pair<IndexReference,PatternPointer>(*ptr);
171 pairpointer =
new std::pair<IndexReference,PatternPointer>(iref, pp);
174 pairpointer =
new std::pair<IndexReference,PatternPointer>(ref.first, ref.second);
199 self_type
operator++(
int junk) { self_type tmpiter = *
this;
next();
return *tmpiter; }
263 totaltokens = patternpointer->
n();
270 bool empty()
const {
return (corpussize <= 1); }
312 virtual size_t size()
const =0;
323 template<
class ContainerType,
class ReadWriteSizeType = u
int64_t,
class PatternType = Pattern>
334 virtual void attachcorpus(
unsigned char * corpusstart,
unsigned int corpussize) {
340 this->corpussize = corpus.
bytesize();
343 this->corpusstart = NULL;
344 this->corpussize = 0;
366 virtual size_t size()
const =0;
367 virtual void reserve(
size_t) =0;
373 virtual typename ContainerType::iterator
begin()=0;
374 virtual typename ContainerType::iterator
end()=0;
375 virtual typename ContainerType::iterator
find(
const Pattern & pattern)=0;
378 virtual void write(std::ostream * out)=0;
397 template<
class ContainerType,
class ValueType,
class ValueHandler,
class ReadWriteSizeType = u
int32_t,
class PatternType=Pattern>
414 virtual size_t size()
const =0;
415 virtual void reserve(
size_t) =0;
424 virtual typename ContainerType::iterator
begin()=0;
425 virtual typename ContainerType::iterator
end()=0;
426 virtual typename ContainerType::iterator
find(
const Pattern & pattern)=0;
432 virtual void write(std::ostream * out) {
433 ReadWriteSizeType s = (ReadWriteSizeType)
size();
434 out->write( (
char*) &s,
sizeof(ReadWriteSizeType));
435 for (iterator iter = this->
begin(); iter != this->
end(); iter++) {
438 this->valuehandler.write(out, iter->second);
445 virtual void write(std::string filename) {
446 std::ofstream * out =
new std::ofstream(filename.c_str());
456 template<
class ReadValueType=ValueType,
class ReadValueHandler=ValueHandler,
class ReadPatternType=PatternType>
457 void read(std::istream * in,
int MINTOKENS=0,
int MINLENGTH=0,
int MAXLENGTH=999999,
PatternStoreInterface * constrainstore = NULL,
bool DONGRAMS=
true,
bool DOSKIPGRAMS=
true,
bool DOFLEXGRAMS=
true,
bool DORESET=
false,
bool DEBUG=
false) {
458 ReadValueHandler readvaluehandler = ReadValueHandler();
461 in->read( (
char*) &s,
sizeof(ReadWriteSizeType));
463 if (DEBUG) std::cerr <<
"Reading " << s <<
" patterns, classencodingversion=" << (int) this->
classencodingversion <<
", @corpusstart=" << (
size_t) this->
corpusstart << std::endl;
464 if (MINTOKENS == -1) MINTOKENS = 0;
465 for (ReadWriteSizeType i = 0; i < s; i++) {
468 }
catch (std::exception &e) {
469 std::cerr <<
"ERROR: Exception occurred at pattern " << (i+1) <<
" of " << s << std::endl;
472 if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
474 if ((!DONGRAMS && c ==
NGRAM) || (!DOSKIPGRAMS && c ==
SKIPGRAM) || (!DOFLEXGRAMS && c ==
FLEXGRAM))
continue;
476 const int n = p.size();
477 if (DEBUG) std::cerr <<
"Read pattern #" << (i+1) <<
", size=" << n <<
", valuehandler=" << readvaluehandler.id() <<
", classencodingversion="<<(int)this->
classencodingversion;
478 ReadValueType readvalue;
479 readvaluehandler.read(in, readvalue);
480 if (n >= MINLENGTH && n <= MAXLENGTH) {
481 if ((readvaluehandler.count(readvalue) >= (
unsigned int) MINTOKENS) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
482 ValueType * convertedvalue = NULL;
484 convertedvalue =
new ValueType();
486 readvaluehandler.convertto(&readvalue, convertedvalue);
487 if (DEBUG) std::cerr <<
"...converted";
488 if (convertedvalue == NULL) {
489 if (DEBUG) std::cerr << std::endl;
490 std::cerr <<
"ERROR: Converted value yielded NULL at pattern #" << (i+1) <<
", size=" << n <<
", valuehandler=" << readvaluehandler.id() <<std::endl;
494 if (DEBUG) std::cerr <<
"...adding";
495 this->
insert(p,*convertedvalue);
496 if ((convertedvalue != NULL) && ((
void*) convertedvalue != (
void*) &readvalue))
delete convertedvalue;
498 if (readvaluehandler.count(readvalue) < (
unsigned int) MINTOKENS) {
499 std::cerr <<
"...skipping because of occurrence (" << readvaluehandler.count(readvalue) <<
" below " << MINTOKENS;
501 std::cerr <<
"...skipping because of constraints";
505 std::cerr <<
"...skipping because of length";
507 if (DEBUG) std::cerr << std::endl;
514 void read(std::string filename,
int MINTOKENS=0,
int MINLENGTH=0,
int MAXLENGTH=999999,
PatternStoreInterface * constrainstore = NULL,
bool DONGRAMS=
true,
bool DOSKIPGRAMS=
true,
bool DOFLEXGRAMS=
true,
bool DORESET =
false,
bool DEBUG=
false) {
515 std::ifstream * in =
new std::ifstream(filename.c_str());
516 this->read<ValueType,ValueHandler>(in,MINTOKENS,MINLENGTH,MAXLENGTH,constrainstore,DONGRAMS,DOSKIPGRAMS,DOFLEXGRAMS, DORESET, DEBUG);
537 template<
class ReadWriteSizeType = u
int32_t>
554 const int cls = iter->first;
556 unsigned char * buffer =
inttobytes(cls, length);
557 data.insert(
Pattern(buffer, length) );
567 const int cls = iter->second;
569 unsigned char * buffer =
inttobytes(cls, length);
570 data.insert(
Pattern(buffer, length) );
581 data.insert(pattern);
587 bool has(
const Pattern & pattern)
const {
return data.count(pattern); }
597 size_t size()
const {
return data.size(); }
606 iterator
begin() {
return data.begin(); }
607 const_iterator
begin()
const {
return data.begin(); }
609 iterator
end() {
return data.end(); }
610 const_iterator
end()
const {
return data.end(); }
616 iterator
find(
const Pattern & pattern) {
return data.find(pattern); }
617 const_iterator
find(
const Pattern & pattern)
const {
return data.find(pattern); }
626 iterator
erase(const_iterator position) {
return data.erase(position); }
633 ReadWriteSizeType s = (ReadWriteSizeType)
size();
634 out->write( (
char*) &s,
sizeof(ReadWriteSizeType));
635 for (iterator iter =
begin(); iter !=
end(); iter++) {
644 void read(std::istream * in,
int MINLENGTH=0,
int MAXLENGTH=999999,
PatternStoreInterface * constrainstore = NULL,
bool DONGRAMS=
true,
bool DOSKIPGRAMS=
true,
bool DOFLEXGRAMS=
true) {
646 in->read( (
char*) &s,
sizeof(ReadWriteSizeType));
648 for (
unsigned int i = 0; i < s; i++) {
650 if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
652 if ((!DONGRAMS && c ==
NGRAM) || (!DOSKIPGRAMS && c ==
SKIPGRAM) || (!DOFLEXGRAMS && c ==
FLEXGRAM))
continue;
654 const int n = p.
size();
655 if ((n >= MINLENGTH && n <= MAXLENGTH) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
665 template<
class ReadValueType,
class ReadValueHandler=BaseValueHandler<ReadValueType>>
666 void readmap(std::istream * in,
int MINTOKENS=0,
int MINLENGTH=0,
int MAXLENGTH=999999,
PatternStoreInterface * constrainstore = NULL,
bool DONGRAMS=
true,
bool DOSKIPGRAMS=
true,
bool DOFLEXGRAMS=
true) {
667 ReadValueHandler readvaluehandler = ReadValueHandler();
669 in->read( (
char*) &s,
sizeof(ReadWriteSizeType));
672 for (ReadWriteSizeType i = 0; i < s; i++) {
676 }
catch (std::exception &e) {
677 std::cerr <<
"ERROR: Exception occurred at pattern " << (i+1) <<
" of " << s << std::endl;
680 if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
682 if ((!DONGRAMS && c ==
NGRAM) || (!DOSKIPGRAMS && c ==
SKIPGRAM) || (!DOFLEXGRAMS && c ==
FLEXGRAM))
continue;
684 const int n = p.
size();
685 ReadValueType readvalue;
687 readvaluehandler.read(in, readvalue);
688 if (n >= MINLENGTH && n <= MAXLENGTH) {
689 if ((readvaluehandler.count(readvalue) >= (
unsigned int) MINTOKENS) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
708 template<
class ReadWriteSizeType = u
int64_t>
718 data.insert(pattern);
721 bool has(
const Pattern & pattern)
const {
return data.count(pattern); }
723 size_t size()
const {
return data.size(); }
726 typedef t_hashorderedpatternset::iterator
iterator;
729 iterator
begin() {
return data.begin(); }
730 const_iterator
begin()
const {
return data.begin(); }
732 iterator
end() {
return data.end(); }
733 const_iterator
end()
const {
return data.end(); }
735 iterator
find(
const Pattern & pattern) {
return data.find(pattern); }
736 const_iterator
find(
const Pattern & pattern)
const {
return data.find(pattern); }
741 iterator
erase(const_iterator position) {
return data.erase(position); }
745 ReadWriteSizeType s = (ReadWriteSizeType)
size();
746 out->write( (
char*) &s,
sizeof(ReadWriteSizeType));
747 for (iterator iter =
begin(); iter !=
end(); iter++) {
753 void read(std::istream * in,
int MINLENGTH=0,
int MAXLENGTH=999999,
bool DONGRAMS=
true,
bool DOSKIPGRAMS=
true,
bool DOFLEXGRAMS=
true) {
755 in->read( (
char*) &s,
sizeof(ReadWriteSizeType));
757 for (
unsigned int i = 0; i < s; i++) {
759 if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
761 if ((!DONGRAMS && c ==
NGRAM) || (!DOSKIPGRAMS && c ==
SKIPGRAM) || (!DOFLEXGRAMS && c ==
FLEXGRAM))
continue;
763 const int n = p.
size();
764 if (n >= MINLENGTH && n <= MAXLENGTH) {
781 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class ReadWriteSizeType = u
int64_t>
784 std::unordered_map<Pattern, ValueType>
data;
790 data[pattern] = value;
796 return data.count(pattern);
800 size_t size()
const {
return data.size(); }
807 typedef typename std::unordered_map<Pattern,ValueType>::iterator
iterator;
808 typedef typename std::unordered_map<Pattern,ValueType>::const_iterator
const_iterator;
810 iterator
begin() {
return data.begin(); }
811 const_iterator
begin()
const {
return data.begin(); }
813 iterator
end() {
return data.end(); }
814 const_iterator
end()
const {
return data.end(); }
816 iterator
find(
const Pattern & pattern) {
return data.find(pattern); }
817 const_iterator
find(
const Pattern & pattern)
const {
return data.find(pattern); }
822 iterator
erase(const_iterator position) {
return data.erase(position); }
827 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class ReadWriteSizeType = u
int64_t>
830 std::unordered_map<PatternPointer, ValueType>
data;
842 data[pattern] = value;
848 return data.count(pattern);
852 size_t size()
const {
return data.size(); }
859 typedef typename std::unordered_map<PatternPointer,ValueType>::iterator
iterator;
860 typedef typename std::unordered_map<PatternPointer,ValueType>::const_iterator
const_iterator;
862 iterator
begin() {
return data.begin(); }
863 const_iterator
begin()
const {
return data.begin(); }
865 iterator
end() {
return data.end(); }
866 const_iterator
end()
const {
return data.end(); }
870 return data.find(pp);
878 iterator
erase(const_iterator position) {
return data.erase(position); }
881 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class ReadWriteSizeType = u
int64_t>
884 std::map<PatternPointer, ValueType>
data;
896 data[pattern] = value;
902 return data.count(pattern);
906 size_t size()
const {
return data.size(); }
913 typedef typename std::map<PatternPointer,ValueType>::iterator
iterator;
914 typedef typename std::map<PatternPointer,ValueType>::const_iterator
const_iterator;
916 iterator
begin() {
return data.begin(); }
917 const_iterator
begin()
const {
return data.begin(); }
919 iterator
end() {
return data.end(); }
920 const_iterator
end()
const {
return data.end(); }
924 return data.find(pp);
932 iterator
erase(const_iterator position) {
return data.erase(position); }
935 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class ReadWriteSizeType = u
int64_t>
938 std::map<const Pattern, ValueType>
data;
944 data[pattern] = value;
949 bool has(
const Pattern & pattern)
const {
return data.count(pattern); }
952 size_t size()
const {
return data.size(); }
958 typedef typename std::map<const Pattern,ValueType>::iterator
iterator;
959 typedef typename std::map<const Pattern,ValueType>::const_iterator
const_iterator;
961 iterator
begin() {
return data.begin(); }
962 const_iterator
begin()
const {
return data.begin(); }
964 iterator
end() {
return data.end(); }
965 const_iterator
end()
const {
return data.end(); }
967 iterator
find(
const Pattern & pattern) {
return data.find(pattern); }
968 const_iterator
find(
const Pattern & pattern)
const {
return data.find(pattern); }
973 iterator
erase(const_iterator position) {
return data.erase(position); }
979 template<
class T,
size_t N,
int countindex = 0>
983 virtual std::string
id() {
return "ArrayValueHandler"; }
984 void read(std::istream * in, std::array<T,N> & a) {
985 for (
int i = 0; i < N; i++) {
987 in->read( (
char*) &v,
sizeof(T));
991 void write(std::ostream * out, std::array<T,N> & a) {
992 for (
int i = 0; i < N; i++) {
994 out->write( (
char*) &v,
sizeof(T));
999 for (
int i = 0; i < N; i++) {
1001 if (!s.empty()) s +=
" ";
1006 unsigned int count(std::array<T,N> & a)
const {
1007 return (
int) a[countindex];
1010 (*value)[countindex] += 1;
1018 template<
class PatternStoreType>
1022 virtual std::string
id() {
return "PatternStoreValueHandler"; }
1023 void read(std::istream * in, PatternStoreType & value) {
1026 void write(std::ostream * out, PatternStoreType & value) {
1029 virtual std::string
tostring( PatternStoreType & value) {
1030 std::cerr <<
"PatternStoreValueHandler::tostring() is not supported" << std::endl;
1033 unsigned int count( PatternStoreType & value)
const {
1034 return value.size();
1037 std::cerr <<
"PatternStoreValueHandler::add() is not supported" << std::endl;
1046 template<
class ValueType,
class ValueHandler=BaseValueHandler<ValueType>,
class NestedSizeType = u
int16_t >
1047 class AlignedPatternMap:
public PatternMap< PatternMap<ValueType,ValueHandler,NestedSizeType>,PatternStoreValueHandler<PatternMap<ValueType,ValueHandler,NestedSizeType>>, uint64_t > {
1051 typedef typename PatternMap< PatternMap<ValueType,ValueHandler,NestedSizeType>,PatternStoreValueHandler<PatternMap<ValueType,ValueHandler,NestedSizeType>>, uint64_t >
::const_iterator const_iterator;
iterator erase(const_iterator position)
Definition: patternstore.h:741
unsigned char * corpusstart
Definition: patternstore.h:326
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:617
std::unordered_map< Pattern, ValueType > data
Definition: patternstore.h:784
std::unordered_map< std::string, unsigned int >::const_iterator const_iterator
Definition: classencoder.h:222
iterator find(const Pattern &pattern)
Definition: patternstore.h:735
ContainerType::const_iterator const_iterator
Definition: patternstore.h:422
bool erase(const PatternPointer &pattern)
Definition: patternstore.h:877
t_patternset::const_iterator const_iterator
Definition: patternstore.h:601
void insert(const PatternPointer &pattern)
Definition: patternstore.h:899
virtual void attachcorpus(const IndexedCorpus &corpus)
Definition: patternstore.h:338
iterator self_type
Definition: patternstore.h:158
PatternMap< PatternMap< ValueType, ValueHandler, NestedSizeType >, PatternStoreValueHandler< PatternMap< ValueType, ValueHandler, NestedSizeType > >, uint64_t >::iterator iterator
Definition: patternstore.h:1050
size_t size() const
Definition: patternstore.h:597
void reserve(size_t s)
Definition: patternstore.h:953
bool erase(const Pattern &pattern)
Definition: patternstore.h:625
virtual bool has(const Pattern &) const =0
void read(std::istream *in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false)
Definition: patternstore.h:457
bool erase(const Pattern &pattern)
Definition: patternstore.h:821
const_iterator begin() const
Definition: patternstore.h:917
unsigned char * corpus
Definition: patternstore.h:46
size_t size() const
Definition: patternstore.h:952
iterator(const self_type &ref)
Definition: patternstore.h:164
unsigned int count(std::array< T, N > &a) const
Definition: patternstore.h:1006
unsigned int inttobytes(unsigned char *buffer, unsigned int cls)
Definition: classencoder.cpp:22
void write(std::ostream *out)
Definition: patternstore.h:744
virtual PatternStoreInterface * getstoreinterface()
Definition: patternstore.h:381
bool has(const Pattern &pattern) const
Definition: patternstore.h:949
unsigned int corpussize
Definition: patternstore.h:47
iterator find(const Pattern &pattern)
Definition: patternstore.h:816
bool empty() const
Definition: patternstore.h:270
void reserve(size_t s)
Definition: patternstore.h:907
iterator find(const Pattern &pattern)
Definition: patternstore.h:967
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:950
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:910
virtual void reserve(size_t)=0
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:968
std::vector< IndexReference > findpattern(const Pattern &pattern, uint32_t sentence=0, int maxmatches=0)
Definition: pattern.cpp:1787
void write(std::ostream *out, PatternStoreType &value)
Definition: patternstore.h:1026
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:804
void insert(const Pattern &pattern)
Definition: patternstore.h:947
Definition: pattern.h:357
std::string tostring(std::array< T, N > &a)
Definition: patternstore.h:997
virtual ContainerType::iterator end()=0
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:798
iterator find(const Pattern &pattern)
Definition: patternstore.h:922
void add(std::array< T, N > *value, const IndexReference &ref) const
Definition: patternstore.h:1009
pointer operator->()
Definition: patternstore.h:201
Classes for data types and handlers for those data types.
unsigned int count(PatternStoreType &value) const
Definition: patternstore.h:1033
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
iterator find(const IndexReference &ref)
Definition: patternstore.h:235
std::map< const Pattern, ValueType > data
Definition: patternstore.h:938
unsigned int sentences() const
Definition: patternstore.h:150
iterator begin()
Definition: patternstore.h:961
Definition: patternstore.h:156
void insert(const PatternPointer &pattern, ValueType &value)
Definition: patternstore.h:841
iterator end()
Definition: patternstore.h:813
bool operator!=(self_type rhs)
Definition: patternstore.h:203
iterator end()
Definition: patternstore.h:919
virtual size_t size() const =0
void add(PatternStoreType *value, const IndexReference &ref) const
Definition: patternstore.h:1036
size_t size() const
Definition: patternstore.h:723
std::map< PatternPointer, ValueType >::iterator iterator
Definition: patternstore.h:913
void reserve(size_t s)
Definition: patternstore.h:801
const size_t n() const
Definition: pattern.cpp:93
virtual bool has(const Pattern &) const =0
const_iterator end() const
Definition: patternstore.h:920
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:955
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:722
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:943
Abstract value handler class, all value handlers are derived from this. Value handlers are interfaces...
Definition: datatypes.h:168
bool operator==(self_type rhs)
Definition: patternstore.h:202
void reserve(size_t s)
Definition: patternstore.h:598
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807
PatternCategory
Definition: pattern.h:52
void reserve(size_t s)
Definition: patternstore.h:853
unsigned int totaltokens
Definition: patternstore.h:49
PatternMap< ValueType, ValueHandler, NestedSizeType > valuetype
Definition: patternstore.h:1049
ContainerType::iterator iterator
Definition: patternstore.h:421
t_hashorderedpatternset::iterator iterator
Definition: patternstore.h:726
iterator begin()
Definition: patternstore.h:916
void next()
Definition: patternstore.h:187
int patterntype
Definition: patternstore.h:329
std::unordered_map< unsigned int, std::string >::const_iterator const_iterator
Definition: classdecoder.h:137
Limited virtual interface to pattern stores.
Definition: interface.h:20
unsigned char * getpointer(const IndexReference &begin) const
Definition: pattern.cpp:1735
std::unordered_map< Pattern, ValueType >::const_iterator const_iterator
Definition: patternstore.h:808
bool erase(const Pattern &pattern)
Definition: patternstore.h:740
void read(std::string filename, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false)
Definition: patternstore.h:514
size_t size() const
Definition: patternstore.h:906
std::unordered_map< PatternPointer, ValueType >::const_iterator const_iterator
Definition: patternstore.h:860
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:819
bool has(const Pattern &pattern) const
Definition: patternstore.h:721
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44
void load(std::istream *in, bool debug=false)
Definition: pattern.cpp:1687
std::map< PatternPointer, ValueType > data
Definition: patternstore.h:884
IndexedCorpus(unsigned char *corpus, unsigned int corpussize)
Definition: patternstore.h:62
bool erase(const Pattern &pattern)
Definition: patternstore.h:972
virtual ContainerType::iterator find(const Pattern &pattern)=0
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:620
const PatternCategory category() const
Definition: pattern.cpp:42
std::unordered_map< PatternPointer, ValueType >::iterator iterator
Definition: patternstore.h:859
std::forward_iterator_tag iterator_category
Definition: patternstore.h:162
virtual std::string id()
Definition: patternstore.h:983
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
std::map< const Pattern, ValueType >::const_iterator const_iterator
Definition: patternstore.h:959
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:592
Reference to a position in the corpus.
Definition: datatypes.h:33
void insert(const PatternPointer &pattern, ValueType &value)
Definition: patternstore.h:895
void insert(const Pattern pattern)
Definition: patternstore.h:717
t_hashorderedpatternset data
Definition: patternstore.h:711
IndexedCorpus()
Definition: patternstore.h:52
iterator erase(const_iterator position)
Definition: patternstore.h:878
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:738
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782
PatternPointer * patternpointer
Definition: patternstore.h:48
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644
IndexPattern & reference
Definition: patternstore.h:160
std::unordered_set< Pattern > t_patternset
Definition: patternstore.h:528
unsigned char classencodingversion
Definition: patternstore.h:328
const_iterator end() const
Definition: patternstore.h:866
iterator end()
Definition: patternstore.h:609
self_type operator++(int junk)
Definition: patternstore.h:199
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:818
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:850
static const bool indexed
Definition: patternstore.h:1021
virtual ContainerType::iterator begin()=0
unsigned int bytestoint(const unsigned char *a, unsigned int *length)
Definition: classdecoder.cpp:20
const_iterator end() const
Definition: patternstore.h:610
~IndexedCorpus()
Definition: patternstore.h:80
int difference_type
Definition: patternstore.h:163
void write(std::ostream *out)
Definition: patternstore.h:632
bool has(const IndexReference &ref) const
Definition: patternstore.h:254
iterator begin()
Definition: patternstore.h:862
PatternPointer getpointer() const
Definition: pattern.cpp:1835
iterator end()
Definition: patternstore.h:865
A complex value handler for values that are themselves pattern stores (allows building nested maps)...
Definition: patternstore.h:1019
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:929
const_iterator end() const
Definition: patternstore.h:965
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:817
virtual bool erase(const PatternType &)=0
bool has(const Pattern &pattern) const
Definition: patternstore.h:901
virtual void reserve(size_t)=0
virtual ValueType & operator[](const Pattern &pattern)=0
const size_t size() const
Definition: pattern.h:156
void insert(const PatternPointer &pattern)
Definition: patternstore.h:845
virtual void write(std::ostream *out)=0
const_iterator begin() const
Definition: patternstore.h:730
unsigned int getcorpussize() const
Definition: patternstore.h:349
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:619
virtual void detachcorpus()
Definition: patternstore.h:342
virtual void insert(const PatternType &pattern)=0
Definition: patternstore.h:882
iterator end()
Definition: patternstore.h:224
Definition: patternstore.h:980
IndexedCorpus * corpus
Definition: patternstore.h:886
static const bool indexed
Definition: patternstore.h:982
const_iterator begin() const
Definition: patternstore.h:863
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:737
virtual void write(std::string filename)
Definition: patternstore.h:445
iterator erase(const_iterator position)
Definition: patternstore.h:626
IndexPattern * pointer
Definition: patternstore.h:161
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39
static const unsigned char delimiterclass
Definition: classdecoder.h:48
int sentencelength(int sentence) const
Definition: pattern.cpp:1806
virtual size_t size() const =0
PatternMap< PatternMap< ValueType, ValueHandler, NestedSizeType >, PatternStoreValueHandler< PatternMap< ValueType, ValueHandler, NestedSizeType > >, uint64_t >::const_iterator const_iterator
Definition: patternstore.h:1051
std::map< PatternPointer, ValueType >::const_iterator const_iterator
Definition: patternstore.h:914
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:753
ContainerType::const_iterator const_iterator
Definition: patternstore.h:371
bool erase(const PatternPointer &pattern)
Definition: patternstore.h:931
size_t size() const
Definition: patternstore.h:800
self_type operator++()
Definition: patternstore.h:182
iterator begin()
Definition: patternstore.h:810
Abstract Pattern store class, not to be instantiated directly.
Definition: patternstore.h:324
void insert(const Pattern &pattern)
Definition: patternstore.h:793
virtual std::string id()
Definition: patternstore.h:1022
ContainerType::iterator iterator
Definition: patternstore.h:370
unsigned int operator[](const IndexReference &ref)
Definition: patternstore.h:279
std::map< const Pattern, ValueType >::iterator iterator
Definition: patternstore.h:958
const_iterator end() const
Definition: patternstore.h:814
iterator begin()
Definition: patternstore.h:729
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:970
std::map< uint32_t, unsigned char * > sentenceindex
Definition: patternstore.h:50
virtual size_t size() const =0
iterator end()
Definition: patternstore.h:964
IndexedCorpus * corpus
Definition: patternstore.h:832
t_patternset data
Definition: patternstore.h:540
void read(std::istream *in, PatternStoreType &value)
Definition: patternstore.h:1023
const_iterator end() const
Definition: patternstore.h:733
unsigned int bytesize() const
Definition: patternstore.h:118
unsigned char * getcorpus() const
Definition: patternstore.h:346
iterator erase(const_iterator position)
Definition: patternstore.h:973
virtual ContainerType::iterator end()=0
virtual ContainerType::iterator begin()=0
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:875
Class for encoding plain-text to binary class-encoded data. The ClassEncoder maintains a mapping of w...
Definition: classencoder.h:50
PatternPointer getpattern() const
Definition: patternstore.h:112
PatternPointer getsentence(int sentence) const
Definition: pattern.cpp:1826
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:228
bool has(const Pattern &pattern) const
Definition: patternstore.h:847
iterator erase(const_iterator position)
Definition: patternstore.h:822
iterator find(const Pattern &pattern)
Definition: patternstore.h:616
virtual void insert(const PatternType &pattern, ValueType &value)=0
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:874
iterator begin()
Definition: patternstore.h:214
iterator(pointer ptr)
Definition: patternstore.h:167
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:736
unsigned char * data
Definition: pattern.h:360
size_t size()
Definition: patternstore.h:261
Definition: patternstore.h:936
Class for encoding plain-text to binary class-encoded data.
A nested pattern map, useful for storing patterns that map to other patterns, which in turn map to va...
Definition: patternstore.h:1047
virtual void write(std::ostream *out)
Definition: patternstore.h:432
iterator end()
Definition: patternstore.h:732
iterator begin()
Definition: patternstore.h:606
void debug()
Definition: patternstore.h:204
A pattern store in the form of an unordered set (i.e, no duplicates). Stores only patterns...
Definition: patternstore.h:538
virtual ContainerType::iterator find(const Pattern &pattern)=0
void read(std::istream *in, std::array< T, N > &a)
Definition: patternstore.h:984
unsigned char * beginpointer() const
Definition: patternstore.h:115
virtual std::string tostring(PatternStoreType &value)
Definition: patternstore.h:1029
Basic largely trivial functions for the common good.
const_iterator begin() const
Definition: patternstore.h:962
ValueHandler valuehandler
Definition: patternstore.h:400
reference operator*()
Definition: patternstore.h:200
~iterator()
Definition: patternstore.h:179
PatternType
Definition: pattern.h:59
iterator()
Definition: patternstore.h:176
virtual void use_v1_format()
Definition: patternstore.h:357
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:904
std::unordered_map< PatternPointer, ValueType > data
Definition: patternstore.h:830
Abstract class for map-like pattern stores, do not instantiate directly.
Definition: patternstore.h:398
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:856
iterator find(const Pattern &pattern)
Definition: patternstore.h:868
void reserve(size_t s)
Definition: patternstore.h:724
Definition: patternstore.h:828
virtual void attachcorpus(unsigned char *corpusstart, unsigned int corpussize)
Definition: patternstore.h:334
t_hashorderedpatternset::const_iterator const_iterator
Definition: patternstore.h:727
iterator(IndexReference iref, PatternPointer pp)
Definition: patternstore.h:170
const_iterator begin() const
Definition: patternstore.h:811
Contains the Pattern class that is ubiquitous throughout Colibri Core.
pointer pairpointer
Definition: patternstore.h:208
void insert(const Pattern &pattern)
Definition: patternstore.h:580
Class for decoding binary class-encoded data back to plain-text.
void write(std::ostream *out, std::array< T, N > &a)
Definition: patternstore.h:991
virtual bool has(const Pattern &) const =0
IndexPattern value_type
Definition: patternstore.h:159
size_t size() const
Definition: patternstore.h:852
bool has(const Pattern &pattern) const
Definition: patternstore.h:587
void readmap(std::istream *in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:666
unsigned int corpussize
Definition: patternstore.h:327
virtual bool erase(const PatternType &)=0
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:928
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:969
t_patternset::iterator iterator
Definition: patternstore.h:600
iterator(reference ref)
Definition: patternstore.h:173
std::set< Pattern > t_hashorderedpatternset
Definition: patternstore.h:698
iterator erase(const_iterator position)
Definition: patternstore.h:932
bool has(const Pattern &pattern) const
Definition: patternstore.h:795
const_iterator begin() const
Definition: patternstore.h:607
A pattern store in the form of an ordered set (i.e, no duplicates). Stores only patterns, no values.
Definition: patternstore.h:709