1 #ifndef COLIBRIPATTERN_H
2 #define COLIBRIPATTERN_H
19 #include <unordered_map>
24 #include <unordered_set>
92 Pattern(
const unsigned char* dataref,
const int size);
117 Pattern(std::istream * in,
bool ignoreeol =
false,
const unsigned char version = 2,
const unsigned char * corpusstart = NULL,
bool debug =
false);
130 data =
new unsigned char[size+1];
139 void write(std::ostream *
out,
const unsigned char * corpusstart = NULL)
const;
144 const size_t n()
const;
156 const size_t size()
const {
return n(); }
182 const size_t hash()
const;
204 std::vector<unsigned int>
tovector()
const;
251 int ngrams(std::vector<Pattern> & container,
const int n)
const;
252 int ngrams(std::vector<PatternPointer> & container,
const int n)
const;
259 int subngrams(std::vector<Pattern> & container,
int minn = 1,
int maxn=99)
const;
260 int subngrams(std::vector<PatternPointer> & container,
int minn = 1,
int maxn=99)
const;
268 int ngrams(std::vector<std::pair<Pattern,int>> & container,
const int n)
const;
269 int ngrams(std::vector<std::pair<PatternPointer,int>> & container,
const int n)
const;
277 int subngrams(std::vector<std::pair<Pattern,int>> & container,
int minn = 1,
int maxn=9)
const;
278 int subngrams(std::vector<std::pair<PatternPointer,int>> & container,
int minn = 1,
int maxn=9)
const;
283 int parts(std::vector<Pattern> & container)
const;
284 int parts(std::vector<PatternPointer> & container)
const;
289 int parts(std::vector<std::pair<int,int> > & container)
const;
294 int gaps(std::vector<std::pair<int,int> > & container)
const;
342 bool isgap(
int i)
const;
348 void mask(std::vector<bool> & container)
const;
351 void set(
const unsigned char* dataref,
const int size);
374 if (bytesize >
B32) {
375 std::cerr <<
"ERROR: Pattern too long for pattern pointer [" << bytesize <<
" bytes,explicit]" << std::endl;
386 std::cerr <<
"ERROR: Pattern too long for pattern pointer [" << b <<
" bytes,implicit]" << std::endl;
396 std::cerr <<
"ERROR: Pattern too long for pattern pointer [" << b <<
" bytes,implicit]" << std::endl;
419 PatternPointer(std::istream * in,
bool ignoreeol =
false,
const unsigned char version = 2,
unsigned char * corpusstart = NULL,
bool debug =
false);
425 void write(std::ostream *
out,
const unsigned char * corpusstart = NULL)
const;
434 const size_t n()
const;
436 const size_t size()
const {
return n(); }
441 const size_t hash()
const;
458 bool isgap(
int i)
const;
469 if (data == other.
data) {
470 if (bytes == other.
bytes) {
471 return mask < other.
mask;
473 return bytes < other.
bytes;
476 return data < other.
data;
480 int ngrams(std::vector<PatternPointer> & container,
const int n)
const;
481 int subngrams(std::vector<PatternPointer> & container,
int minn = 1,
int maxn=9)
const;
482 int ngrams(std::vector<std::pair<PatternPointer,int>> & container,
const int n)
const;
483 int subngrams(std::vector<std::pair<PatternPointer,int>> & container,
int minn = 1,
int maxn=9)
const;
488 int parts(std::vector<PatternPointer> & container)
const;
493 int parts(std::vector<std::pair<int,int> > & container)
const;
498 int gaps(std::vector<std::pair<int,int> > & container)
const;
531 static const unsigned char * tmp_unk = (
const unsigned char *)
"\2";
532 static const unsigned char * tmp_skipmarker = (
const unsigned char *)
"\3";
533 static const unsigned char * tmp_flexmarker = (
const unsigned char *)
"\4";
536 static const Pattern UNKPATTERN =
Pattern((
const unsigned char* ) tmp_unk,1);
544 return pattern.hash();
554 return pattern->hash();
562 return pattern.hash();
572 return pattern->hash();
Pattern patternfromfile(const std::string &filename)
Definition: pattern.cpp:1839
bool out() const
Definition: pattern.cpp:345
bool operator<(const PatternPointer &other) const
Definition: pattern.h:468
const unsigned long long B32
Definition: common.h:23
uint32_t computemask() const
Definition: pattern.cpp:174
Pattern()
Definition: pattern.h:84
const size_t hash() const
Definition: pattern.cpp:198
const size_t size() const
Definition: pattern.h:436
int ngrams(std::vector< PatternPointer > &container, const int n) const
Definition: pattern.cpp:1072
size_t operator()(const Pattern &pattern) const
Definition: pattern.h:543
bool isgap(int i) const
Definition: pattern.cpp:98
void mask(std::vector< bool > &container) const
bool operator<(const Pattern &other) const
Definition: pattern.cpp:960
const bool unknown() const
Definition: pattern.cpp:349
size_t operator()(const PatternPointer *pattern) const
Definition: pattern.h:571
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278
const bool isskipgram() const
Definition: pattern.h:170
std::string decode(const ClassDecoder &classdecoder) const
Definition: pattern.h:193
const int MAINPATTERNBUFFERSIZE
Definition: pattern.h:47
void set(const unsigned char *dataref, const int size)
Definition: pattern.cpp:594
Definition: pattern.h:357
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1533
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
bool contains(const Pattern &subpattern) const
Definition: pattern.cpp:1046
const size_t bytesize() const
Definition: pattern.cpp:57
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1507
Pattern operator[](int index) const
Definition: pattern.h:177
const bool isflexgram() const
Definition: pattern.h:171
const size_t n() const
Definition: pattern.cpp:93
PatternCategory
Definition: pattern.h:52
bool operator!=(const PatternPointer &other) const
Definition: pattern.h:452
const Pattern SKIPPATTERN
Definition: pattern.h:534
int find(const Pattern &subpattern) const
Definition: pattern.cpp:1026
bool operator!=(const Pattern &other) const
Definition: pattern.cpp:953
void readanddiscardpattern(std::istream *in, bool pointerformat=false)
Definition: pattern.cpp:379
Pattern addskips(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1596
PatternPointer addskips(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1620
int flexcollapse(unsigned char *collapseddata) const
Definition: pattern.cpp:897
const PatternCategory category() const
Definition: pattern.cpp:42
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
int gaps(std::vector< std::pair< int, int > > &container) const
Definition: pattern.cpp:1407
Pattern toflexgram() const
Definition: pattern.cpp:131
Pattern operator+(const Pattern &) const
Definition: pattern.cpp:994
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const
Definition: pattern.cpp:1142
const size_t bytesize() const
Definition: pattern.h:435
std::string decode(const ClassDecoder &classdecoder) const
Definition: pattern.h:448
Pattern addskip(const std::pair< int, int > &gap) const
Definition: pattern.cpp:1584
bool isgap(int i) const
Definition: pattern.cpp:126
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:236
PatternPointer getpointer() const
Definition: pattern.cpp:1835
const Pattern FLEXPATTERN
Definition: pattern.h:535
PatternPointer addskip(const std::pair< int, int > &gap) const
Definition: pattern.cpp:1611
PatternPointer & operator=(const PatternPointer &other)
Definition: pattern.h:412
const size_t size() const
Definition: pattern.h:156
uint32_t bytes
Definition: pattern.h:361
bool operator!=(const Pattern &other) const
Definition: pattern.h:455
PatternPointer & operator++()
Definition: pattern.cpp:1006
int gaps(std::vector< std::pair< int, int > > &container) const
Definition: pattern.cpp:1434
static const unsigned char skipclass
Definition: classdecoder.h:50
static const unsigned char delimiterclass
Definition: classdecoder.h:48
static const int patterntype
Definition: pattern.h:77
const size_t hash() const
Definition: pattern.cpp:202
int reader_passmarker(const unsigned char c, std::istream *in)
int ngrams(std::vector< Pattern > &container, const int n) const
Definition: pattern.cpp:1050
Pattern(int size)
Definition: pattern.h:128
Pattern addcontext(const Pattern &leftcontext, const Pattern &rightcontext) const
bool out() const
Definition: pattern.cpp:341
const unsigned int skipcount() const
Definition: pattern.cpp:1386
void operator=(const Pattern &other)
Definition: pattern.cpp:977
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const
Definition: pattern.cpp:1120
std::vector< unsigned int > tovector() const
Definition: pattern.cpp:364
bool operator==(const Pattern &other) const
Definition: pattern.cpp:861
uint32_t mask
Definition: pattern.h:362
PatternPointer(const Pattern &ref)
Definition: pattern.h:382
const bool isflexgram() const
Definition: pattern.h:445
const PatternCategory category() const
Definition: pattern.cpp:46
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:228
const unsigned int skipcount() const
Definition: pattern.cpp:1366
size_t operator()(const PatternPointer &pattern) const
Definition: pattern.h:561
unsigned char * data
Definition: pattern.h:360
int parts(std::vector< PatternPointer > &container) const
Definition: pattern.cpp:1337
size_t operator()(const Pattern *pattern) const
Definition: pattern.h:553
const size_t n() const
Definition: pattern.cpp:89
unsigned char * data
Definition: pattern.h:78
Pattern reverse() const
Definition: pattern.cpp:1640
Pattern extractskipcontent(const Pattern &instance) const
Definition: pattern.cpp:1467
Basic largely trivial functions for the common good.
bool operator==(const PatternPointer &other) const
Definition: pattern.cpp:922
const bool isskipgram() const
Definition: pattern.h:444
PatternType
Definition: pattern.h:59
PatternPointer(const PatternPointer *ref)
Definition: pattern.h:407
Pattern pattern() const
Definition: pattern.h:527
Pattern replace(int begin, int length, const Pattern &replacement) const
Definition: pattern.cpp:1559
PatternPointer toflexgram() const
Definition: pattern.cpp:168
Measurement begin(const string &title)
Definition: benchmarks.cpp:148
Class for decoding binary class-encoded data back to plain-text.
PatternPointer(const PatternPointer &ref)
Definition: pattern.h:402
int parts(std::vector< Pattern > &container) const
Definition: pattern.cpp:1225
~Pattern()
Definition: pattern.cpp:853
bool operator>(const Pattern &other) const
Definition: pattern.cpp:973
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:283
Pattern addflexgaps(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1630
PatternPointer(const Pattern *ref)
Definition: pattern.h:392
PatternPointer()
Definition: pattern.h:366
static const int patterntype
Definition: pattern.h:359
PatternPointer(unsigned char *dataref, const unsigned int bytesize)
Definition: pattern.h:372