Colibri Core
pattern.h
Go to the documentation of this file.
1 #ifndef COLIBRIPATTERN_H
2 #define COLIBRIPATTERN_H
3 
4 /*****************************
5 * Colibri Core
6 * by Maarten van Gompel
7 * Centre for Language Studies
8 * Radboud University Nijmegen
9 *
10 * http://proycon.github.io/colibri-core
11 *
12 * Licensed under GPLv3
13 *****************************/
14 
15 #include <string>
16 #include <iostream>
17 #include <ostream>
18 #include <istream>
19 #include <unordered_map>
20 #include <vector>
21 #include <set>
22 #include <map>
23 #include <array>
24 #include <unordered_set>
25 #include <iomanip> // contains setprecision()
26 #include <exception>
27 #include <algorithm>
28 #include <climits>
29 #include "common.h"
30 #include "classdecoder.h"
31 
32 
// Capacity constant for the main pattern buffer.
// NOTE(review): presumably a size in bytes used by the stream-reading code in pattern.cpp — confirm there.
constexpr int MAINPATTERNBUFFERSIZE = 40960;
48 
54  NGRAM = 1,
55  SKIPGRAM = 2,
56  FLEXGRAM = 3,
57 };
58 
/**
 * Tags the two pattern representations declared in this header. Each class
 * exposes its tag through a static `patterntype` member.
 */
enum PatternType {
    PATTERN = 0,        ///< tag used by class Pattern
    PATTERNPOINTER = 1, ///< tag used by class PatternPointer
};
63 
// Reads one serialised pattern from `in` and throws the result away (going by the function's name).
// NOTE(review): `pointerformat` presumably selects the PatternPointer serialisation instead of the
// Pattern one — confirm against the definition in pattern.cpp.
64 void readanddiscardpattern(std::istream * in, bool pointerformat = false);
// Stream-reading helper taking the byte `c` just read and the stream it came from.
// NOTE(review): the meaning of the returned int is not visible in this header — confirm in pattern.cpp.
65 int reader_passmarker(const unsigned char c, std::istream * in);
66 
67 
68 
69 class PatternPointer;
70 
75 class Pattern {
76  public:
77  const static int patterntype = PATTERN;
78  unsigned char * data;
84  Pattern() { data = new unsigned char[1]; data[0] = ClassDecoder::delimiterclass; }
85 
92  Pattern(const unsigned char* dataref, const int size);
93 
100  Pattern(const Pattern& ref, unsigned int begin, unsigned int length);
101  Pattern(const PatternPointer& ref,unsigned int begin, unsigned int length);
102 
107  Pattern(const Pattern& ref);
108  Pattern(const PatternPointer& ref);
109 
117  Pattern(std::istream * in, bool ignoreeol = false, const unsigned char version = 2, const unsigned char * corpusstart = NULL, bool debug = false);
118  //Pattern(std::istream * in, unsigned char * buffer, int maxbuffersize, bool ignoreeol = false, const unsigned char version = 2, bool debug = false);
119 
120 
121  ~Pattern();
122 
123 
128  Pattern(int size) {
129  //pattern consisting only of fixed skips
130  data = new unsigned char[size+1];
131  for (int i = 0; i < size; i++) data[i] = ClassDecoder::skipclass;
133  }
134 
139  void write(std::ostream * out, const unsigned char * corpusstart = NULL) const;
140 
144  const size_t n() const;
145 
146 
151  const size_t bytesize() const;
152 
156  const size_t size() const { return n(); }
157 
158 
162  const unsigned int skipcount() const;
163 
167  const PatternCategory category() const;
168 
169 
170  const bool isskipgram() const { return category() == SKIPGRAM; }
171  const bool isflexgram() const { return category() == FLEXGRAM; }
172  const bool unknown() const;
173 
177  Pattern operator [](int index) const { return Pattern(*this, index,1); }
178 
182  const size_t hash() const;
183 
188  std::string tostring(const ClassDecoder& classdecoder) const; //pattern to string (decode)
189 
193  std::string decode(const ClassDecoder& classdecoder) const { return tostring(classdecoder); } //alias
194 
198  bool out() const;
199 
204  std::vector<unsigned int> tovector() const;
205 
206  bool operator==(const Pattern & other) const;
207  bool operator!=(const Pattern & other) const;
208 
209  bool operator==(const PatternPointer & other) const;
210  bool operator!=(const PatternPointer & other) const;
211 
215  void operator =(const Pattern & other);
216 
217 
222  bool operator<(const Pattern & other) const;
223  bool operator>(const Pattern & other) const;
224 
225  Pattern operator +(const Pattern&) const;
226 
227  PatternPointer getpointer() const;
228 
233  int find(const Pattern & subpattern) const;
234 
238  bool contains(const Pattern & subpattern) const; //does the pattern contain the smaller pattern?
239 
243  bool instanceof(const Pattern & skipgram) const;
244 
245 
251  int ngrams(std::vector<Pattern> & container, const int n) const;
252  int ngrams(std::vector<PatternPointer> & container, const int n) const;
253 
259  int subngrams(std::vector<Pattern> & container, int minn = 1, int maxn=99) const; //return all subsumed ngrams (variable n)
260  int subngrams(std::vector<PatternPointer> & container, int minn = 1, int maxn=99) const; //return all subsumed ngrams (variable n)
261 
268  int ngrams(std::vector<std::pair<Pattern,int>> & container, const int n) const; //return multiple ngrams
269  int ngrams(std::vector<std::pair<PatternPointer,int>> & container, const int n) const; //return multiple ngrams
270 
277  int subngrams(std::vector<std::pair<Pattern,int>> & container, int minn = 1, int maxn=9) const; //return all subsumed ngrams (variable n)
278  int subngrams(std::vector<std::pair<PatternPointer,int>> & container, int minn = 1, int maxn=9) const; //return all subsumed ngrams (variable n)
279 
283  int parts(std::vector<Pattern> & container) const;
284  int parts(std::vector<PatternPointer> & container) const;
285 
289  int parts(std::vector<std::pair<int,int> > & container) const;
290 
294  int gaps(std::vector<std::pair<int,int> > & container) const;
295 
296 
300  Pattern extractskipcontent(const Pattern & instance) const;
301 
306  Pattern replace(int begin, int length, const Pattern & replacement) const;
307 
313  Pattern addskip(const std::pair<int,int> & gap) const;
319  Pattern addskips(const std::vector<std::pair<int,int> > & gaps) const;
325  Pattern addflexgaps(const std::vector<std::pair<int,int> > & gaps) const;
326 
330  Pattern reverse() const;
331 
335  Pattern toflexgram() const;
336 
337 
338 
342  bool isgap(int i) const;
343 
344 
345  //NOT IMPLEMENTED YET:
346 
347  Pattern addcontext(const Pattern & leftcontext, const Pattern & rightcontext) const;
348  void mask(std::vector<bool> & container) const; //returns a boolean mask of the skipgram (0 = gap(encapsulation) , 1 = skipgram coverage)
349 
350  //sets an entirely new value
351  void set(const unsigned char* dataref, const int size);
352 };
353 
354 
355 Pattern patternfromfile(const std::string & filename); //helper function to read pattern from file, mostly for Cython
356 
358  public:
359  const static int patterntype = PATTERNPOINTER;
360  unsigned char * data;
361  uint32_t bytes; //number of bytes
362  uint32_t mask; //0 == NGRAM
363  //first bit high = flexgram, right-aligned, 0 = gap
364  //first bit low = skipgram, right-aligned, 0 = gap , max skipgram length 31 tokens
365 
367  data = NULL;
368  bytes = 0;
369  mask = 0;
370  }
371 
372  PatternPointer(unsigned char* dataref, const unsigned int bytesize) {
373  data = dataref;
374  if (bytesize > B32) {
375  std::cerr << "ERROR: Pattern too long for pattern pointer [" << bytesize << " bytes,explicit]" << std::endl;
376  throw InternalError();
377  }
378  bytes = bytesize;
379  mask = computemask();
380  }
381 
382  PatternPointer(const Pattern & ref) {
383  data = ref.data;
384  const size_t b = ref.bytesize();
385  if (b > B32) {
386  std::cerr << "ERROR: Pattern too long for pattern pointer [" << b << " bytes,implicit]" << std::endl;
387  throw InternalError();
388  }
389  bytes = b;
390  mask = computemask();
391  }
392  PatternPointer(const Pattern * ref) {
393  data = ref->data;
394  const size_t b = ref->bytesize();
395  if (b > B32) {
396  std::cerr << "ERROR: Pattern too long for pattern pointer [" << b << " bytes,implicit]" << std::endl;
397  throw InternalError();
398  }
399  bytes = b;
400  mask = computemask();
401  }
403  data = ref.data;
404  bytes = ref.bytes;
405  mask = ref.mask;
406  }
408  data = ref->data;
409  bytes = ref->bytes;
410  mask = ref->mask;
411  }
413  data = other.data;
414  bytes = other.bytes;
415  mask = other.mask;
416  // by convention, always return *this (for chaining)
417  return *this;
418  }
419  PatternPointer(std::istream * in, bool ignoreeol = false, const unsigned char version = 2, unsigned char * corpusstart = NULL, bool debug = false);
420 
425  void write(std::ostream * out, const unsigned char * corpusstart = NULL) const;
426 
427  //slice construtors:
428  PatternPointer(unsigned char *, unsigned int,unsigned int);
429  PatternPointer(const PatternPointer&, unsigned int,unsigned int);
430  PatternPointer(const Pattern&, unsigned int,unsigned int);
431 
432  uint32_t computemask() const;
433 
434  const size_t n() const;
435  const size_t bytesize() const { return bytes; }
436  const size_t size() const { return n(); }
437 
441  const size_t hash() const;
442 
443  const PatternCategory category() const;
444  const bool isskipgram() const { return category() == SKIPGRAM; }
445  const bool isflexgram() const { return category() == FLEXGRAM; }
446 
447  std::string tostring(const ClassDecoder& classdecoder) const; //pattern to string (decode)
448  std::string decode(const ClassDecoder& classdecoder) const { return tostring(classdecoder); } //pattern to string (decode)
449  bool out() const;
450 
451  bool operator==(const PatternPointer & other) const;
452  bool operator!=(const PatternPointer & other) const { return !(*this == other); }
453 
454  bool operator==(const Pattern & other) const;
455  bool operator!=(const Pattern & other) const { return !(*this == other); }
456 
457  PatternPointer toflexgram() const;
458  bool isgap(int i) const;
459 
460 
467 
468  bool operator<(const PatternPointer & other) const {
469  if (data == other.data) {
470  if (bytes == other.bytes) {
471  return mask < other.mask;
472  } else {
473  return bytes < other.bytes;
474  }
475  } else {
476  return data < other.data;
477  }
478  }
479 
480  int ngrams(std::vector<PatternPointer> & container, const int n) const;
481  int subngrams(std::vector<PatternPointer> & container, int minn = 1, int maxn=9) const; //return all subsumed ngrams (variable n)
482  int ngrams(std::vector<std::pair<PatternPointer,int>> & container, const int n) const; //return multiple ngrams
483  int subngrams(std::vector<std::pair<PatternPointer,int>> & container, int minn = 1, int maxn=9) const; //return all subsumed ngrams (variable n)
484 
488  int parts(std::vector<PatternPointer> & container) const;
489 
493  int parts(std::vector<std::pair<int,int> > & container) const;
494 
498  int gaps(std::vector<std::pair<int,int> > & container) const;
499 
503  const unsigned int skipcount() const;
504 
510  PatternPointer addskip(const std::pair<int,int> & gap) const;
511 
517  PatternPointer addskips(const std::vector<std::pair<int,int> > & gaps) const;
518 
522  int flexcollapse(unsigned char * collapseddata) const;
523 
524  bool instanceof(const Pattern & skipgram) const;
525 
526  operator Pattern() { return Pattern(*this); } //cast overload
527  Pattern pattern() const { return Pattern(*this); } //cast overload
528 };
529 
530 
531 static const unsigned char * tmp_unk = (const unsigned char *) "\2";
532 static const unsigned char * tmp_skipmarker = (const unsigned char *) "\3";
533 static const unsigned char * tmp_flexmarker = (const unsigned char *) "\4";
534 const Pattern SKIPPATTERN = Pattern((const unsigned char *) tmp_skipmarker,1);
535 const Pattern FLEXPATTERN = Pattern((const unsigned char*) tmp_flexmarker,1);
536 static const Pattern UNKPATTERN = Pattern((const unsigned char* ) tmp_unk,1);
537 
538 namespace std {
539 
540  template <>
541  struct hash<Pattern> {
542  public:
543  size_t operator()(const Pattern &pattern) const throw() {
544  return pattern.hash();
545  }
546  };
547 
548 
549 
550  template <>
551  struct hash<const Pattern *> {
552  public:
553  size_t operator()(const Pattern * pattern) const throw() {
554  return pattern->hash();
555  }
556  };
557 
558  template <>
559  struct hash<PatternPointer> {
560  public:
561  size_t operator()(const PatternPointer &pattern) const throw() {
562  return pattern.hash();
563  }
564  };
565 
566 
567 
568  template <>
569  struct hash<const PatternPointer *> {
570  public:
571  size_t operator()(const PatternPointer * pattern) const throw() {
572  return pattern->hash();
573  }
574  };
575 
576 }
577 
578 #endif
Pattern patternfromfile(const std::string &filename)
Definition: pattern.cpp:1839
bool out() const
Definition: pattern.cpp:345
bool operator<(const PatternPointer &other) const
Definition: pattern.h:468
const unsigned long long B32
Definition: common.h:23
uint32_t computemask() const
Definition: pattern.cpp:174
Pattern()
Definition: pattern.h:84
PATTERNPOINTER
Definition: pattern.h:61
const size_t hash() const
Definition: pattern.cpp:198
const size_t size() const
Definition: pattern.h:436
int ngrams(std::vector< PatternPointer > &container, const int n) const
Definition: pattern.cpp:1072
size_t operator()(const Pattern &pattern) const
Definition: pattern.h:543
bool isgap(int i) const
Definition: pattern.cpp:98
void mask(std::vector< bool > &container) const
bool operator<(const Pattern &other) const
Definition: pattern.cpp:960
const bool unknown() const
Definition: pattern.cpp:349
size_t operator()(const PatternPointer *pattern) const
Definition: pattern.h:571
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278
const bool isskipgram() const
Definition: pattern.h:170
std::string decode(const ClassDecoder &classdecoder) const
Definition: pattern.h:193
const int MAINPATTERNBUFFERSIZE
Definition: pattern.h:47
void set(const unsigned char *dataref, const int size)
Definition: pattern.cpp:594
PatternPointer
Definition: pattern.h:357
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1533
NGRAM
Definition: pattern.h:54
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
STL namespace.
bool contains(const Pattern &subpattern) const
Definition: pattern.cpp:1046
const size_t bytesize() const
Definition: pattern.cpp:57
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1507
Pattern operator[](int index) const
Definition: pattern.h:177
const bool isflexgram() const
Definition: pattern.h:171
const size_t n() const
Definition: pattern.cpp:93
PatternCategory
Definition: pattern.h:52
bool operator!=(const PatternPointer &other) const
Definition: pattern.h:452
const Pattern SKIPPATTERN
Definition: pattern.h:534
int find(const Pattern &subpattern) const
Definition: pattern.cpp:1026
bool operator!=(const Pattern &other) const
Definition: pattern.cpp:953
void readanddiscardpattern(std::istream *in, bool pointerformat=false)
Definition: pattern.cpp:379
Pattern addskips(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1596
PatternPointer addskips(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1620
int flexcollapse(unsigned char *collapseddata) const
Definition: pattern.cpp:897
const PatternCategory category() const
Definition: pattern.cpp:42
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
int gaps(std::vector< std::pair< int, int > > &container) const
Definition: pattern.cpp:1407
Pattern toflexgram() const
Definition: pattern.cpp:131
Pattern operator+(const Pattern &) const
Definition: pattern.cpp:994
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const
Definition: pattern.cpp:1142
FLEXGRAM
Definition: pattern.h:56
const size_t bytesize() const
Definition: pattern.h:435
std::string decode(const ClassDecoder &classdecoder) const
Definition: pattern.h:448
Pattern addskip(const std::pair< int, int > &gap) const
Definition: pattern.cpp:1584
bool isgap(int i) const
Definition: pattern.cpp:126
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:236
PatternPointer getpointer() const
Definition: pattern.cpp:1835
const Pattern FLEXPATTERN
Definition: pattern.h:535
PatternPointer addskip(const std::pair< int, int > &gap) const
Definition: pattern.cpp:1611
PatternPointer & operator=(const PatternPointer &other)
Definition: pattern.h:412
const size_t size() const
Definition: pattern.h:156
uint32_t bytes
Definition: pattern.h:361
bool operator!=(const Pattern &other) const
Definition: pattern.h:455
PatternPointer & operator++()
Definition: pattern.cpp:1006
int gaps(std::vector< std::pair< int, int > > &container) const
Definition: pattern.cpp:1434
static const unsigned char skipclass
Definition: classdecoder.h:50
static const unsigned char delimiterclass
Definition: classdecoder.h:48
static const int patterntype
Definition: pattern.h:77
const size_t hash() const
Definition: pattern.cpp:202
int reader_passmarker(const unsigned char c, std::istream *in)
int ngrams(std::vector< Pattern > &container, const int n) const
Definition: pattern.cpp:1050
Pattern(int size)
Definition: pattern.h:128
Pattern addcontext(const Pattern &leftcontext, const Pattern &rightcontext) const
bool out() const
Definition: pattern.cpp:341
const unsigned int skipcount() const
Definition: pattern.cpp:1386
void operator=(const Pattern &other)
Definition: pattern.cpp:977
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const
Definition: pattern.cpp:1120
std::vector< unsigned int > tovector() const
Definition: pattern.cpp:364
bool operator==(const Pattern &other) const
Definition: pattern.cpp:861
uint32_t mask
Definition: pattern.h:362
PatternPointer(const Pattern &ref)
Definition: pattern.h:382
const bool isflexgram() const
Definition: pattern.h:445
const PatternCategory category() const
Definition: pattern.cpp:46
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:228
const unsigned int skipcount() const
Definition: pattern.cpp:1366
size_t operator()(const PatternPointer &pattern) const
Definition: pattern.h:561
unsigned char * data
Definition: pattern.h:360
int parts(std::vector< PatternPointer > &container) const
Definition: pattern.cpp:1337
size_t operator()(const Pattern *pattern) const
Definition: pattern.h:553
const size_t n() const
Definition: pattern.cpp:89
unsigned char * data
Definition: pattern.h:78
Pattern reverse() const
Definition: pattern.cpp:1640
Pattern extractskipcontent(const Pattern &instance) const
Definition: pattern.cpp:1467
Definition: common.h:35
Basic largely trivial functions for the common good.
bool operator==(const PatternPointer &other) const
Definition: pattern.cpp:922
const bool isskipgram() const
Definition: pattern.h:444
PatternType
Definition: pattern.h:59
PatternPointer(const PatternPointer *ref)
Definition: pattern.h:407
Definition: pattern.h:53
Pattern pattern() const
Definition: pattern.h:527
Pattern replace(int begin, int length, const Pattern &replacement) const
Definition: pattern.cpp:1559
PatternPointer toflexgram() const
Definition: pattern.cpp:168
Measurement begin(const string &title)
Definition: benchmarks.cpp:148
Class for decoding binary class-encoded data back to plain-text.
PatternPointer(const PatternPointer &ref)
Definition: pattern.h:402
int parts(std::vector< Pattern > &container) const
Definition: pattern.cpp:1225
~Pattern()
Definition: pattern.cpp:853
PATTERN
Definition: pattern.h:60
bool operator>(const Pattern &other) const
Definition: pattern.cpp:973
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:283
Pattern addflexgaps(const std::vector< std::pair< int, int > > &gaps) const
Definition: pattern.cpp:1630
PatternPointer(const Pattern *ref)
Definition: pattern.h:392
SKIPGRAM
Definition: pattern.h:55
PatternPointer()
Definition: pattern.h:366
static const int patterntype
Definition: pattern.h:359
PatternPointer(unsigned char *dataref, const unsigned int bytesize)
Definition: pattern.h:372