Colibri Core
patternstore.h
Go to the documentation of this file.
1 #ifndef COLIBRIPATTERNSTORE_H
2 #define COLIBRIPATTERNSTORE_H
3 
4 #include <string>
5 #include <iostream>
6 #include <ostream>
7 #include <istream>
8 #include <unordered_map>
9 #include <vector>
10 #include <set>
11 #include <map>
12 #include <array>
13 #include <unordered_set>
14 #include <iomanip> // contains setprecision()
15 #include <exception>
16 #include <algorithm>
17 #include "common.h"
18 #include "pattern.h"
19 #include "datatypes.h"
20 #include "classdecoder.h"
21 #include "classencoder.h"
22 
23 /***********************************************************************************/
24 
39 typedef std::pair<IndexReference,PatternPointer> IndexPattern;
45  protected:
46  unsigned char * corpus;
47  unsigned int corpussize; //in bytes
48  PatternPointer * patternpointer; //pattern pointer covering the whole corpus
49  unsigned int totaltokens;
50  std::map<uint32_t,unsigned char*> sentenceindex; //sentence pointers
51  public:
53  corpus = NULL;
54  corpussize = 0;
55  totaltokens = 0; //will be computed when queried
56  patternpointer = NULL;
57  };
58 
59  /*
60  * Low-level constructor
61  */
62  IndexedCorpus(unsigned char * corpus, unsigned int corpussize) {
63  this->corpus = corpus;
64  this->corpussize = 0;
65  totaltokens = 0; //will be computed when queried
66  patternpointer = new PatternPointer(corpus,corpussize);
67  }
68 
69  /*
70  * Read an indexed corpus from stream. The stream must correspond to an
71  * encoded corpus (*.colibri.dat)
72  */
73  IndexedCorpus(std::istream *in, bool debug = false);
74  /*
75  * Read an indexed corpus from file. The filename must correspond to an
76  * encoded corpus (*.colibri.dat)
77  */
78  IndexedCorpus(std::string filename, bool debug = false);
79 
81  if (corpus != NULL) delete[] corpus;
82  if (patternpointer != NULL) delete patternpointer;
83  }
84 
85  /*
86  * Read an indexed corpus from stream. The stream must correspond to an
87  * encoded corpus (*.colibri.dat)
88  */
89  void load(std::istream *in, bool debug = false);
90 
91  /*
92  * Read an indexed corpus from file. The filename must correspond to an
93  * encoded corpus (*.colibri.dat)
94  */
95  void load(std::string filename, bool debug = false);
96 
97 
98 
104  unsigned char * getpointer(const IndexReference & begin) const;
105 
110  PatternPointer getpattern(const IndexReference & begin, int length=1) const;
111 
113  return *patternpointer;
114  }
115  unsigned char * beginpointer() const {
116  return corpus;
117  }
118  unsigned int bytesize() const {
119  return corpussize;
120  }
121 
126  PatternPointer getsentence(int sentence) const; //returns sentence as a pattern pointer
127  PatternPointer getsentence(unsigned char * sentencedata) const; //returns sentence as a pattern pointer
128 
136  std::vector<IndexReference> findpattern(const Pattern & pattern, uint32_t sentence = 0, int maxmatches=0);
137  void findpattern(std::vector<IndexReference> & result, const Pattern & pattern, uint32_t sentence, const PatternPointer & sentencedata, int maxmatches=0);
138 
143  int sentencelength(int sentence) const;
144  int sentencelength(unsigned char * sentencebegin) const;
145 
150  unsigned int sentences() const { return sentenceindex.size(); } //returns the number of sentences (1-indexed)
151 
152 
156  class iterator {
157  public:
162  typedef std::forward_iterator_tag iterator_category;
163  typedef int difference_type;
164  iterator(const self_type & ref) { //copy constructor
165  pairpointer = new std::pair<IndexReference,PatternPointer>(*ref.pairpointer);
166  }
167  iterator(pointer ptr) {
168  pairpointer = new std::pair<IndexReference,PatternPointer>(*ptr);
169  }
171  pairpointer = new std::pair<IndexReference,PatternPointer>(iref, pp);
172  }
173  iterator(reference ref) {
174  pairpointer = new std::pair<IndexReference,PatternPointer>(ref.first, ref.second);
175  }
176  iterator() { //default constructor, required for cython
177  pairpointer = NULL;
178  }
180  if (pairpointer != NULL) delete pairpointer;
181  }
182  self_type operator++() {
183  next();
184  return *this;
185  } //prefix
186 
187  void next() {
188  ++(pairpointer->second);
189  if (*(pairpointer->second.data) == ClassDecoder::delimiterclass) {
190  //we never stop at delimiterclasses, iterate again:
191  pairpointer->first.sentence++;
192  pairpointer->first.token = 0;
193  ++(pairpointer->second);
194  } else {
195  pairpointer->first.token++;
196  }
197  //Note: At the end of the data, the patternpointer is out of bounds, checking against end() should work fine though
198  }
199  self_type operator++(int junk) { self_type tmpiter = *this; next(); return *tmpiter; } //postfix
200  reference operator*() { return *pairpointer; }
201  pointer operator->() { return pairpointer; }
202  bool operator==(self_type rhs) { return pairpointer->first == rhs->first; }
203  bool operator!=(self_type rhs) { return pairpointer->first != rhs->first; }
204  void debug() {
205  std::cerr << (size_t) pairpointer << std::endl;
206  }
207  protected:
208  pointer pairpointer;
209  };
210 
211  /*
212  * Returns the begin iterator over the corpus
213  */
215  IndexReference iref = IndexReference(1,0);
216  PatternPointer p = getpattern(iref,1);
217  return iterator(iref,p);
218  }
219  //const_iterator begin() const { return data.begin(); }
220 
221  /*
222  * Returns the end iterator of the corpus
223  */
225  IndexReference iref = IndexReference(sentences() + 1,0);
226  PatternPointer p = PatternPointer(corpus,corpussize+1); //will be an invalid pointer, should never be used though
227  return iterator(iref,p);
228  }
229  //const_iterator end() const { return data.end(); }
230 
235  iterator find(const IndexReference & ref) {
236  try {
237  PatternPointer p = getpattern(ref);
238  return iterator(ref,p);
239  } catch (KeyError &e) {
240  return end();
241  }
242  }
247  /*const_iterator find(const IndexReference & ref) const {
248  return std::lower_bound(this->begin(), this->end(), IndexPattern(ref) ); //does binary search
249  }*/
250 
254  bool has(const IndexReference & ref) const {
255  return (getpointer(ref) != NULL);
256  }
257 
261  size_t size() {
262  if (totaltokens > 0) return totaltokens;
263  totaltokens = patternpointer->n();
264  return totaltokens;
265  }
266 
270  bool empty() const { return (corpussize <= 1); }
271 
272 
279  unsigned int operator [](const IndexReference & ref) {
280  try {
281  PatternPointer pp = getpattern(ref);
282  return bytestoint(pp.data);
283  } catch (KeyError &e) {
284  throw e;
285  }
286  }
287 
288 
289 
290 
291 };
292 
293 
294 /************* Base abstract container for pattern storage ********************/
295 
299 class PatternStoreInterface {
300  public:
304  virtual bool has(const Pattern &) const =0;
308  virtual bool has(const PatternPointer &) const =0;
312  virtual size_t size() const =0;
313 };
314 
323 template<class ContainerType,class ReadWriteSizeType = uint64_t,class PatternType = Pattern>
325  protected:
326  unsigned char * corpusstart; //used only when PatternType=PatternPointer
327  unsigned int corpussize;
328  unsigned char classencodingversion;
330  public:
331  PatternStore<ContainerType,ReadWriteSizeType,PatternType>() {corpusstart = NULL; corpussize = 0; classencodingversion = 2; patterntype = PatternType::patterntype; };
333 
334  virtual void attachcorpus(unsigned char * corpusstart, unsigned int corpussize) {
335  this->corpusstart = corpusstart;
336  this->corpussize = corpussize;
337  }
338  virtual void attachcorpus(const IndexedCorpus & corpus) {
339  this->corpusstart = corpus.beginpointer();
340  this->corpussize = corpus.bytesize();
341  }
342  virtual void detachcorpus() {
343  this->corpusstart = NULL;
344  this->corpussize = 0;
345  }
346  unsigned char * getcorpus() const {
347  return corpusstart;
348  }
349  unsigned int getcorpussize() const {
350  return corpussize;
351  }
352 
353 
357  virtual void use_v1_format() { this->classencodingversion = 1; }
358 
359  virtual void insert(const PatternType & pattern)=0; //might be a noop in some implementations that require a value
360 
361  virtual bool has(const Pattern &) const =0;
362  virtual bool has(const PatternPointer &) const =0;
363 
364  virtual bool erase(const PatternType &) =0;
365 
366  virtual size_t size() const =0;
367  virtual void reserve(size_t) =0; //might be a noop in some implementations
368 
369 
370  typedef typename ContainerType::iterator iterator;
371  typedef typename ContainerType::const_iterator const_iterator;
372 
373  virtual typename ContainerType::iterator begin()=0;
374  virtual typename ContainerType::iterator end()=0;
375  virtual typename ContainerType::iterator find(const Pattern & pattern)=0;
376  virtual typename ContainerType::iterator find(const PatternPointer & pattern)=0;
377 
378  virtual void write(std::ostream * out)=0;
379  //virtual void read(std::istream * in, int MINTOKENS)=0;
380 
382  return (PatternStoreInterface*) this;
383  }
384 };
385 
386 
387 /************* Abstract datatype for all kinds of maps ********************/
388 
389 
397 template<class ContainerType, class ValueType, class ValueHandler,class ReadWriteSizeType = uint32_t,class PatternType=Pattern>
398 class PatternMapStore: public PatternStore<ContainerType,ReadWriteSizeType,PatternType> {
399  protected:
400  ValueHandler valuehandler;
401  public:
404 
405 
406  virtual void insert(const PatternType & pattern, ValueType & value)=0;
407 
408  virtual bool has(const Pattern &) const =0;
409  virtual bool has(const PatternPointer &) const =0;
410 
411  virtual bool erase(const PatternType &) =0;
412 
413 
414  virtual size_t size() const =0;
415  virtual void reserve(size_t) =0;
416 
417 
418  virtual ValueType & operator [](const Pattern & pattern)=0;
419  virtual ValueType & operator [](const PatternPointer & pattern)=0;
420 
421  typedef typename ContainerType::iterator iterator;
422  typedef typename ContainerType::const_iterator const_iterator;
423 
424  virtual typename ContainerType::iterator begin()=0;
425  virtual typename ContainerType::iterator end()=0;
426  virtual typename ContainerType::iterator find(const Pattern & pattern)=0;
427  virtual typename ContainerType::iterator find(const PatternPointer & pattern)=0;
428 
432  virtual void write(std::ostream * out) {
433  ReadWriteSizeType s = (ReadWriteSizeType) size();
434  out->write( (char*) &s, sizeof(ReadWriteSizeType));
435  for (iterator iter = this->begin(); iter != this->end(); iter++) {
436  PatternType p = iter->first;
437  p.write(out, this->corpusstart);
438  this->valuehandler.write(out, iter->second);
439  }
440  }
441 
445  virtual void write(std::string filename) {
446  std::ofstream * out = new std::ofstream(filename.c_str());
447  this->write(out);
448  out->close();
449  delete out;
450  }
451 
452 
456  template<class ReadValueType=ValueType, class ReadValueHandler=ValueHandler,class ReadPatternType=PatternType>
457  void read(std::istream * in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface * constrainstore = NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false) {
458  ReadValueHandler readvaluehandler = ReadValueHandler();
459  ReadWriteSizeType s; //read size:
460  ReadPatternType p;
461  in->read( (char*) &s, sizeof(ReadWriteSizeType));
462  reserve(s);
463  if (DEBUG) std::cerr << "Reading " << s << " patterns, classencodingversion=" << (int) this->classencodingversion << ", @corpusstart=" << (size_t) this->corpusstart << std::endl;
464  if (MINTOKENS == -1) MINTOKENS = 0;
465  for (ReadWriteSizeType i = 0; i < s; i++) {
466  try {
467  p = ReadPatternType(in, false, this->classencodingversion, this->corpusstart, DEBUG);
468  } catch (std::exception &e) {
469  std::cerr << "ERROR: Exception occurred at pattern " << (i+1) << " of " << s << std::endl;
470  throw InternalError();
471  }
472  if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
473  const PatternCategory c = p.category();
474  if ((!DONGRAMS && c == NGRAM) || (!DOSKIPGRAMS && c == SKIPGRAM) || (!DOFLEXGRAMS && c == FLEXGRAM)) continue;
475  }
476  const int n = p.size();
477  if (DEBUG) std::cerr << "Read pattern #" << (i+1) << ", size=" << n << ", valuehandler=" << readvaluehandler.id() << ", classencodingversion="<<(int)this->classencodingversion;
478  ReadValueType readvalue;
479  readvaluehandler.read(in, readvalue);
480  if (n >= MINLENGTH && n <= MAXLENGTH) {
481  if ((readvaluehandler.count(readvalue) >= (unsigned int) MINTOKENS) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
482  ValueType * convertedvalue = NULL;
483  if (DORESET) {
484  convertedvalue = new ValueType();
485  } else {
486  readvaluehandler.convertto(&readvalue, convertedvalue);
487  if (DEBUG) std::cerr << "...converted";
488  if (convertedvalue == NULL) {
489  if (DEBUG) std::cerr << std::endl;
490  std::cerr << "ERROR: Converted value yielded NULL at pattern #" << (i+1) << ", size=" << n << ", valuehandler=" << readvaluehandler.id() <<std::endl;
491  throw InternalError();
492  }
493  }
494  if (DEBUG) std::cerr << "...adding";
495  this->insert(p,*convertedvalue);
496  if ((convertedvalue != NULL) && ((void*) convertedvalue != (void*) &readvalue)) delete convertedvalue;
497  } else if (DEBUG) {
498  if (readvaluehandler.count(readvalue) < (unsigned int) MINTOKENS) {
499  std::cerr << "...skipping because of occurrence (" << readvaluehandler.count(readvalue) << " below " << MINTOKENS;
500  } else {
501  std::cerr << "...skipping because of constraints";
502  }
503  }
504  } else if (DEBUG) {
505  std::cerr << "...skipping because of length";
506  }
507  if (DEBUG) std::cerr << std::endl;
508  }
509  }
510 
514  void read(std::string filename,int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface * constrainstore = NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET = false, bool DEBUG=false) { //no templates for this one, easier on python/cython
515  std::ifstream * in = new std::ifstream(filename.c_str());
516  this->read<ValueType,ValueHandler>(in,MINTOKENS,MINLENGTH,MAXLENGTH,constrainstore,DONGRAMS,DOSKIPGRAMS,DOFLEXGRAMS, DORESET, DEBUG);
517  in->close();
518  delete in;
519  }
520 
521 };
522 
523 
524 /************* Specific STL-like containers for patterns ********************/
525 
526 
527 
528 typedef std::unordered_set<Pattern> t_patternset;
529 
537 template<class ReadWriteSizeType = uint32_t>
538 class PatternSet: public PatternStore<t_patternset,ReadWriteSizeType,Pattern> {
539  protected:
541  public:
542 
547 
548 
553  for (ClassDecoder::const_iterator iter = classdecoder.begin(); iter != classdecoder.end(); iter++) {
554  const int cls = iter->first;
555  int length; //will be set by inttobytes
556  unsigned char * buffer = inttobytes(cls, length);
557  data.insert( Pattern(buffer, length) );
558  delete buffer;
559  }
560  }
561 
566  for (ClassEncoder::const_iterator iter = classencoder.begin(); iter != classencoder.end(); iter++) {
567  const int cls = iter->second;
568  int length; //will be set by inttobytes
569  unsigned char * buffer = inttobytes(cls, length);
570  data.insert( Pattern(buffer, length) );
571  delete buffer;
572  }
573  }
574 
576 
580  void insert(const Pattern & pattern) {
581  data.insert(pattern);
582  }
583 
587  bool has(const Pattern & pattern) const { return data.count(pattern); }
588 
592  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
593 
597  size_t size() const { return data.size(); }
598  void reserve(size_t s) { data.reserve(s); }
599 
600  typedef t_patternset::iterator iterator;
601  typedef t_patternset::const_iterator const_iterator;
602 
606  iterator begin() { return data.begin(); }
607  const_iterator begin() const { return data.begin(); }
608 
609  iterator end() { return data.end(); }
610  const_iterator end() const { return data.end(); }
611 
616  iterator find(const Pattern & pattern) { return data.find(pattern); }
617  const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
618 
619  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
620  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
621 
625  bool erase(const Pattern & pattern) { return data.erase(pattern); }
626  iterator erase(const_iterator position) { return data.erase(position); }
627 
628 
632  void write(std::ostream * out) {
633  ReadWriteSizeType s = (ReadWriteSizeType) size();
634  out->write( (char*) &s, sizeof(ReadWriteSizeType));
635  for (iterator iter = begin(); iter != end(); iter++) {
636  Pattern p = *iter;
637  p.write(out, this->corpusstart);
638  }
639  }
640 
644  void read(std::istream * in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface * constrainstore = NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true) {
645  ReadWriteSizeType s; //read size:
646  in->read( (char*) &s, sizeof(ReadWriteSizeType));
647  reserve(s);
648  for (unsigned int i = 0; i < s; i++) {
649  Pattern p = Pattern(in, false, this->classencodingversion);
650  if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
651  const PatternCategory c = p.category();
652  if ((!DONGRAMS && c == NGRAM) || (!DOSKIPGRAMS && c == SKIPGRAM) || (!DOFLEXGRAMS && c == FLEXGRAM)) continue;
653  }
654  const int n = p.size();
655  if ((n >= MINLENGTH && n <= MAXLENGTH) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
656  insert(p);
657  }
658  }
659  }
660 
665  template<class ReadValueType, class ReadValueHandler=BaseValueHandler<ReadValueType>>
666  void readmap(std::istream * in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface * constrainstore = NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true) {
667  ReadValueHandler readvaluehandler = ReadValueHandler();
668  ReadWriteSizeType s; //read size:
669  in->read( (char*) &s, sizeof(ReadWriteSizeType));
670  reserve(s);
671  //std::cerr << "Reading " << (int) s << " patterns" << std::endl;
672  for (ReadWriteSizeType i = 0; i < s; i++) {
673  Pattern p;
674  try {
675  p = Pattern(in, false, this->classencodingversion);
676  } catch (std::exception &e) {
677  std::cerr << "ERROR: Exception occurred at pattern " << (i+1) << " of " << s << std::endl;
678  throw InternalError();
679  }
680  if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
681  const PatternCategory c = p.category();
682  if ((!DONGRAMS && c == NGRAM) || (!DOSKIPGRAMS && c == SKIPGRAM) || (!DOFLEXGRAMS && c == FLEXGRAM)) continue;
683  }
684  const int n = p.size();
685  ReadValueType readvalue;
686  //std::cerr << "Read pattern: " << std::endl;
687  readvaluehandler.read(in, readvalue);
688  if (n >= MINLENGTH && n <= MAXLENGTH) {
689  if ((readvaluehandler.count(readvalue) >= (unsigned int) MINTOKENS) && ((constrainstore == NULL) || (constrainstore->has(p)))) {
690  this->insert(p);
691  }
692  }
693  }
694  }
695 };
696 
697 
698 typedef std::set<Pattern> t_hashorderedpatternset;
699 
700 
708 template<class ReadWriteSizeType = uint64_t>
709 class HashOrderedPatternSet: public PatternStore<t_hashorderedpatternset,ReadWriteSizeType> {
710  protected:
712  public:
713 
716 
717  void insert(const Pattern pattern) {
718  data.insert(pattern);
719  }
720 
721  bool has(const Pattern & pattern) const { return data.count(pattern); }
722  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
723  size_t size() const { return data.size(); }
724  void reserve(size_t s) {} //noop
725 
726  typedef t_hashorderedpatternset::iterator iterator;
727  typedef t_hashorderedpatternset::const_iterator const_iterator;
728 
729  iterator begin() { return data.begin(); }
730  const_iterator begin() const { return data.begin(); }
731 
732  iterator end() { return data.end(); }
733  const_iterator end() const { return data.end(); }
734 
735  iterator find(const Pattern & pattern) { return data.find(pattern); }
736  const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
737  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
738  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
739 
740  bool erase(const Pattern & pattern) { return data.erase(pattern); }
741  iterator erase(const_iterator position) { return data.erase(position); }
742 
743 
744  void write(std::ostream * out) {
745  ReadWriteSizeType s = (ReadWriteSizeType) size();
746  out->write( (char*) &s, sizeof(ReadWriteSizeType));
747  for (iterator iter = begin(); iter != end(); iter++) {
748  Pattern p = *iter;
749  p.write(out, this->corpusstart);
750  }
751  }
752 
753  void read(std::istream * in, int MINLENGTH=0, int MAXLENGTH=999999, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true) {
754  ReadWriteSizeType s; //read size:
755  in->read( (char*) &s, sizeof(ReadWriteSizeType));
756  reserve(s);
757  for (unsigned int i = 0; i < s; i++) {
758  Pattern p = Pattern(in, false, this->classencodingversion);
759  if (!DONGRAMS || !DOSKIPGRAMS || !DOFLEXGRAMS) {
760  const PatternCategory c = p.category();
761  if ((!DONGRAMS && c == NGRAM) || (!DOSKIPGRAMS && c == SKIPGRAM) || (!DOFLEXGRAMS && c == FLEXGRAM)) continue;
762  }
763  const int n = p.size();
764  if (n >= MINLENGTH && n <= MAXLENGTH) {
765  insert(p);
766  }
767  }
768  }
769 
770 };
771 
772 
781 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class ReadWriteSizeType = uint64_t>
782 class PatternMap: public PatternMapStore<std::unordered_map<Pattern,ValueType>,ValueType,ValueHandler,ReadWriteSizeType,Pattern> {
783  protected:
784  std::unordered_map<Pattern, ValueType> data;
785  public:
786  //PatternMap(): PatternMapStore<std::unordered_map<const Pattern, ValueType>,ValueType,ValueHandler,ReadWriteSizeType>() {};
788 
789  void insert(const Pattern & pattern, ValueType & value) {
790  data[pattern] = value;
791  }
792 
793  void insert(const Pattern & pattern) { data[pattern] = ValueType(); } //singular insert required by PatternStore, implies 'default' ValueType, usually 0
794 
795  bool has(const Pattern & pattern) const {
796  return data.count(pattern);
797  }
798  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
799 
800  size_t size() const { return data.size(); }
801  void reserve(size_t s) { data.reserve(s); }
802 
803 
804  ValueType& operator [](const Pattern & pattern) { return data[pattern]; }
805  ValueType& operator [](const PatternPointer & pattern) { return data[pattern]; }
806 
807  typedef typename std::unordered_map<Pattern,ValueType>::iterator iterator;
808  typedef typename std::unordered_map<Pattern,ValueType>::const_iterator const_iterator;
809 
810  iterator begin() { return data.begin(); }
811  const_iterator begin() const { return data.begin(); }
812 
813  iterator end() { return data.end(); }
814  const_iterator end() const { return data.end(); }
815 
816  iterator find(const Pattern & pattern) { return data.find(pattern); }
817  const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
818  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
819  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
820 
821  bool erase(const Pattern & pattern) { return data.erase(pattern); }
822  iterator erase(const_iterator position) { return data.erase(position); }
823 
824 };
825 
826 
827 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class ReadWriteSizeType = uint64_t>
828 class PatternPointerMap: public PatternMapStore<std::unordered_map<PatternPointer,ValueType>,ValueType,ValueHandler,ReadWriteSizeType,PatternPointer> {
829  protected:
830  std::unordered_map<PatternPointer, ValueType> data;
831  public:
833  //PatternMap(): PatternMapStore<std::unordered_map<const Pattern, ValueType>,ValueType,ValueHandler,ReadWriteSizeType>() {};
835  this->corpus = corpus;
836  };
837 
839 
840 
841  void insert(const PatternPointer & pattern, ValueType & value) {
842  data[pattern] = value;
843  }
844 
845  void insert(const PatternPointer & pattern) { data[pattern] = ValueType(); } //singular insert required by PatternStore, implies 'default' ValueType, usually 0
846 
847  bool has(const Pattern & pattern) const {
848  return data.count(pattern);
849  }
850  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
851 
852  size_t size() const { return data.size(); }
853  void reserve(size_t s) { data.reserve(s); }
854 
855 
856  ValueType& operator [](const Pattern & pattern) { return data[pattern]; }
857  ValueType& operator [](const PatternPointer & pattern) { return data[pattern]; }
858 
859  typedef typename std::unordered_map<PatternPointer,ValueType>::iterator iterator;
860  typedef typename std::unordered_map<PatternPointer,ValueType>::const_iterator const_iterator;
861 
862  iterator begin() { return data.begin(); }
863  const_iterator begin() const { return data.begin(); }
864 
865  iterator end() { return data.end(); }
866  const_iterator end() const { return data.end(); }
867 
868  iterator find(const Pattern & pattern) {
869  PatternPointer pp = pattern.getpointer();
870  return data.find(pp);
871  }
872  //const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
873 
874  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
875  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
876 
877  bool erase(const PatternPointer & pattern) { return data.erase(pattern); }
878  iterator erase(const_iterator position) { return data.erase(position); }
879 };
880 
881 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class ReadWriteSizeType = uint64_t>
882 class OrderedPatternPointerMap: public PatternMapStore<std::map<PatternPointer,ValueType>,ValueType,ValueHandler,ReadWriteSizeType,PatternPointer> {
883  protected:
884  std::map<PatternPointer, ValueType> data;
885  public:
887  //PatternMap(): PatternMapStore<std::unordered_map<const Pattern, ValueType>,ValueType,ValueHandler,ReadWriteSizeType>() {};
889  this->corpus = corpus;
890  };
891 
893 
894 
895  void insert(const PatternPointer & pattern, ValueType & value) {
896  data[pattern] = value;
897  }
898 
899  void insert(const PatternPointer & pattern) { data[pattern] = ValueType(); } //singular insert required by PatternStore, implies 'default' ValueType, usually 0
900 
901  bool has(const Pattern & pattern) const {
902  return data.count(pattern);
903  }
904  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
905 
906  size_t size() const { return data.size(); }
907  void reserve(size_t s) { } //noop
908 
909 
910  ValueType& operator [](const Pattern & pattern) { return data[pattern]; }
911  ValueType& operator [](const PatternPointer & pattern) { return data[pattern]; }
912 
913  typedef typename std::map<PatternPointer,ValueType>::iterator iterator;
914  typedef typename std::map<PatternPointer,ValueType>::const_iterator const_iterator;
915 
916  iterator begin() { return data.begin(); }
917  const_iterator begin() const { return data.begin(); }
918 
919  iterator end() { return data.end(); }
920  const_iterator end() const { return data.end(); }
921 
922  iterator find(const Pattern & pattern) {
923  PatternPointer pp = pattern.getpointer();
924  return data.find(pp);
925  }
926  //const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
927 
928  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
929  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
930 
931  bool erase(const PatternPointer & pattern) { return data.erase(pattern); }
932  iterator erase(const_iterator position) { return data.erase(position); }
933 };
934 
935 template<class ValueType,class ValueHandler = BaseValueHandler<ValueType>,class ReadWriteSizeType = uint64_t>
936 class HashOrderedPatternMap: public PatternMapStore<std::map<const Pattern,ValueType>,ValueType,ValueHandler,ReadWriteSizeType,Pattern> {
937  protected:
938  std::map<const Pattern, ValueType> data;
939  public:
940  HashOrderedPatternMap<ValueType,ValueHandler,ReadWriteSizeType>(): PatternMapStore<std::map<const Pattern, ValueType>,ValueType,ValueHandler,ReadWriteSizeType>() {};
942 
943  void insert(const Pattern & pattern, ValueType & value) {
944  data[pattern] = value;
945  }
946 
947  void insert(const Pattern & pattern) { data[pattern] = ValueType(); } //singular insert required by PatternStore, implies 'default' ValueType
948 
949  bool has(const Pattern & pattern) const { return data.count(pattern); }
950  bool has(const PatternPointer & pattern) const { return data.count(pattern); }
951 
952  size_t size() const { return data.size(); }
953  void reserve(size_t s) {} //noop
954 
955  ValueType& operator [](const Pattern & pattern) { return data[pattern]; }
956  ValueType& operator [](const PatternPointer & pattern) { return data[pattern]; }
957 
958  typedef typename std::map<const Pattern,ValueType>::iterator iterator;
959  typedef typename std::map<const Pattern,ValueType>::const_iterator const_iterator;
960 
961  iterator begin() { return data.begin(); }
962  const_iterator begin() const { return data.begin(); }
963 
964  iterator end() { return data.end(); }
965  const_iterator end() const { return data.end(); }
966 
967  iterator find(const Pattern & pattern) { return data.find(pattern); }
968  const_iterator find(const Pattern & pattern) const { return data.find(pattern); }
969  iterator find(const PatternPointer & pattern) { return data.find(pattern); }
970  const_iterator find(const PatternPointer & pattern) const { return data.find(pattern); }
971 
972  bool erase(const Pattern & pattern) { return data.erase(pattern); }
973  iterator erase(const_iterator position) { return data.erase(position); }
974 
975 
976 };
977 
978 
979 template<class T,size_t N,int countindex = 0>
981  public:
982  const static bool indexed = false;
983  virtual std::string id() { return "ArrayValueHandler"; }
984  void read(std::istream * in, std::array<T,N> & a) {
985  for (int i = 0; i < N; i++) {
986  T v;
987  in->read( (char*) &v, sizeof(T));
988  a[i] = v;
989  }
990  }
991  void write(std::ostream * out, std::array<T,N> & a) {
992  for (int i = 0; i < N; i++) {
993  T v = a[i];
994  out->write( (char*) &v, sizeof(T));
995  }
996  }
997  std::string tostring(std::array<T,N> & a) {
998  std::string s;
999  for (int i = 0; i < N; i++) {
1000  T v = a[i];
1001  if (!s.empty()) s += " ";
1002  s += " " + tostring(a[i]);
1003  }
1004  return s;
1005  }
1006  unsigned int count(std::array<T,N> & a) const {
1007  return (int) a[countindex];
1008  }
1009  void add(std::array<T,N> * value, const IndexReference & ref ) const {
1010  (*value)[countindex] += 1;
1011  }
1012 };
1013 
1014 
1018 template<class PatternStoreType>
1019 class PatternStoreValueHandler: public AbstractValueHandler<PatternStoreType> {
1020  public:
1021  const static bool indexed = false;
1022  virtual std::string id() { return "PatternStoreValueHandler"; }
1023  void read(std::istream * in, PatternStoreType & value) {
1024  value.read(in);
1025  }
1026  void write(std::ostream * out, PatternStoreType & value) {
1027  value.write(out);
1028  }
1029  virtual std::string tostring( PatternStoreType & value) {
1030  std::cerr << "PatternStoreValueHandler::tostring() is not supported" << std::endl;
1031  throw InternalError();
1032  }
1033  unsigned int count( PatternStoreType & value) const {
1034  return value.size();
1035  }
1036  void add( PatternStoreType * value, const IndexReference & ref ) const {
1037  std::cerr << "PatternStoreValueHandler::add() is not supported" << std::endl;
1038  throw InternalError();
1039  }
1040 };
1041 
1046 template<class ValueType,class ValueHandler=BaseValueHandler<ValueType>, class NestedSizeType = uint16_t >
1047 class AlignedPatternMap: public PatternMap< PatternMap<ValueType,ValueHandler,NestedSizeType>,PatternStoreValueHandler<PatternMap<ValueType,ValueHandler,NestedSizeType>>, uint64_t > {
1048  public:
1051  typedef typename PatternMap< PatternMap<ValueType,ValueHandler,NestedSizeType>,PatternStoreValueHandler<PatternMap<ValueType,ValueHandler,NestedSizeType>>, uint64_t >::const_iterator const_iterator;
1052 
1053 };
1054 
1055 
1056 //TODO: Implement a real Trie, conserving more memory
1057 #endif
iterator erase(const_iterator position)
Definition: patternstore.h:741
unsigned char * corpusstart
Definition: patternstore.h:326
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:617
std::unordered_map< Pattern, ValueType > data
Definition: patternstore.h:784
std::unordered_map< std::string, unsigned int >::const_iterator const_iterator
Definition: classencoder.h:222
iterator find(const Pattern &pattern)
Definition: patternstore.h:735
ContainerType::const_iterator const_iterator
Definition: patternstore.h:422
bool erase(const PatternPointer &pattern)
Definition: patternstore.h:877
t_patternset::const_iterator const_iterator
Definition: patternstore.h:601
void insert(const PatternPointer &pattern)
Definition: patternstore.h:899
virtual void attachcorpus(const IndexedCorpus &corpus)
Definition: patternstore.h:338
iterator self_type
Definition: patternstore.h:158
PatternMap< PatternMap< ValueType, ValueHandler, NestedSizeType >, PatternStoreValueHandler< PatternMap< ValueType, ValueHandler, NestedSizeType > >, uint64_t >::iterator iterator
Definition: patternstore.h:1050
size_t size() const
Definition: patternstore.h:597
void reserve(size_t s)
Definition: patternstore.h:953
bool erase(const Pattern &pattern)
Definition: patternstore.h:625
virtual bool has(const Pattern &) const =0
void read(std::istream *in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false)
Definition: patternstore.h:457
bool erase(const Pattern &pattern)
Definition: patternstore.h:821
const_iterator begin() const
Definition: patternstore.h:917
unsigned char * corpus
Definition: patternstore.h:46
size_t size() const
Definition: patternstore.h:952
iterator(const self_type &ref)
Definition: patternstore.h:164
unsigned int count(std::array< T, N > &a) const
Definition: patternstore.h:1006
unsigned int inttobytes(unsigned char *buffer, unsigned int cls)
Definition: classencoder.cpp:22
void write(std::ostream *out)
Definition: patternstore.h:744
virtual PatternStoreInterface * getstoreinterface()
Definition: patternstore.h:381
bool has(const Pattern &pattern) const
Definition: patternstore.h:949
unsigned int corpussize
Definition: patternstore.h:47
iterator find(const Pattern &pattern)
Definition: patternstore.h:816
bool empty() const
Definition: patternstore.h:270
void reserve(size_t s)
Definition: patternstore.h:907
iterator find(const Pattern &pattern)
Definition: patternstore.h:967
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:950
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:910
virtual void reserve(size_t)=0
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:968
std::vector< IndexReference > findpattern(const Pattern &pattern, uint32_t sentence=0, int maxmatches=0)
Definition: pattern.cpp:1787
void write(std::ostream *out, PatternStoreType &value)
Definition: patternstore.h:1026
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:804
void insert(const Pattern &pattern)
Definition: patternstore.h:947
Definition: pattern.h:357
std::string tostring(std::array< T, N > &a)
Definition: patternstore.h:997
virtual ContainerType::iterator end()=0
Definition: pattern.h:54
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:798
iterator find(const Pattern &pattern)
Definition: patternstore.h:922
void add(std::array< T, N > *value, const IndexReference &ref) const
Definition: patternstore.h:1009
pointer operator->()
Definition: patternstore.h:201
Classes for data types and handlers for those data types.
unsigned int count(PatternStoreType &value) const
Definition: patternstore.h:1033
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
iterator find(const IndexReference &ref)
Definition: patternstore.h:235
std::map< const Pattern, ValueType > data
Definition: patternstore.h:938
unsigned int sentences() const
Definition: patternstore.h:150
iterator begin()
Definition: patternstore.h:961
Definition: patternstore.h:156
void insert(const PatternPointer &pattern, ValueType &value)
Definition: patternstore.h:841
iterator end()
Definition: patternstore.h:813
bool operator!=(self_type rhs)
Definition: patternstore.h:203
iterator end()
Definition: patternstore.h:919
virtual size_t size() const =0
void add(PatternStoreType *value, const IndexReference &ref) const
Definition: patternstore.h:1036
size_t size() const
Definition: patternstore.h:723
std::map< PatternPointer, ValueType >::iterator iterator
Definition: patternstore.h:913
void reserve(size_t s)
Definition: patternstore.h:801
const size_t n() const
Definition: pattern.cpp:93
virtual bool has(const Pattern &) const =0
const_iterator end() const
Definition: patternstore.h:920
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:955
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:722
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:943
Abstract value handler class, all value handlers are derived from this. Value handlers are interfaces...
Definition: datatypes.h:168
bool operator==(self_type rhs)
Definition: patternstore.h:202
void reserve(size_t s)
Definition: patternstore.h:598
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807
PatternCategory
Definition: pattern.h:52
void reserve(size_t s)
Definition: patternstore.h:853
unsigned int totaltokens
Definition: patternstore.h:49
PatternMap< ValueType, ValueHandler, NestedSizeType > valuetype
Definition: patternstore.h:1049
ContainerType::iterator iterator
Definition: patternstore.h:421
t_hashorderedpatternset::iterator iterator
Definition: patternstore.h:726
iterator begin()
Definition: patternstore.h:916
void next()
Definition: patternstore.h:187
int patterntype
Definition: patternstore.h:329
std::unordered_map< unsigned int, std::string >::const_iterator const_iterator
Definition: classdecoder.h:137
Limited virtual interface to pattern stores.
Definition: interface.h:20
unsigned char * getpointer(const IndexReference &begin) const
Definition: pattern.cpp:1735
std::unordered_map< Pattern, ValueType >::const_iterator const_iterator
Definition: patternstore.h:808
bool erase(const Pattern &pattern)
Definition: patternstore.h:740
void read(std::string filename, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false)
Definition: patternstore.h:514
size_t size() const
Definition: patternstore.h:906
std::unordered_map< PatternPointer, ValueType >::const_iterator const_iterator
Definition: patternstore.h:860
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:819
Definition: common.h:43
bool has(const Pattern &pattern) const
Definition: patternstore.h:721
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44
void load(std::istream *in, bool debug=false)
Definition: pattern.cpp:1687
std::map< PatternPointer, ValueType > data
Definition: patternstore.h:884
IndexedCorpus(unsigned char *corpus, unsigned int corpussize)
Definition: patternstore.h:62
bool erase(const Pattern &pattern)
Definition: patternstore.h:972
virtual ContainerType::iterator find(const Pattern &pattern)=0
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:620
const PatternCategory category() const
Definition: pattern.cpp:42
std::unordered_map< PatternPointer, ValueType >::iterator iterator
Definition: patternstore.h:859
std::forward_iterator_tag iterator_category
Definition: patternstore.h:162
virtual std::string id()
Definition: patternstore.h:983
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
std::map< const Pattern, ValueType >::const_iterator const_iterator
Definition: patternstore.h:959
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:592
Reference to a position in the corpus.
Definition: datatypes.h:33
void insert(const PatternPointer &pattern, ValueType &value)
Definition: patternstore.h:895
void insert(const Pattern pattern)
Definition: patternstore.h:717
t_hashorderedpatternset data
Definition: patternstore.h:711
IndexedCorpus()
Definition: patternstore.h:52
iterator erase(const_iterator position)
Definition: patternstore.h:878
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:738
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782
PatternPointer * patternpointer
Definition: patternstore.h:48
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644
IndexPattern & reference
Definition: patternstore.h:160
std::unordered_set< Pattern > t_patternset
Definition: patternstore.h:528
unsigned char classencodingversion
Definition: patternstore.h:328
const_iterator end() const
Definition: patternstore.h:866
iterator end()
Definition: patternstore.h:609
self_type operator++(int junk)
Definition: patternstore.h:199
Definition: pattern.h:56
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:818
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:850
static const bool indexed
Definition: patternstore.h:1021
virtual ContainerType::iterator begin()=0
unsigned int bytestoint(const unsigned char *a, unsigned int *length)
Definition: classdecoder.cpp:20
const_iterator end() const
Definition: patternstore.h:610
~IndexedCorpus()
Definition: patternstore.h:80
int difference_type
Definition: patternstore.h:163
void write(std::ostream *out)
Definition: patternstore.h:632
bool has(const IndexReference &ref) const
Definition: patternstore.h:254
iterator begin()
Definition: patternstore.h:862
PatternPointer getpointer() const
Definition: pattern.cpp:1835
iterator end()
Definition: patternstore.h:865
A complex value handler for values that are themselves pattern stores (allows building nested maps)...
Definition: patternstore.h:1019
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:929
const_iterator end() const
Definition: patternstore.h:965
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:817
virtual bool erase(const PatternType &)=0
bool has(const Pattern &pattern) const
Definition: patternstore.h:901
virtual void reserve(size_t)=0
virtual ValueType & operator[](const Pattern &pattern)=0
const size_t size() const
Definition: pattern.h:156
void insert(const PatternPointer &pattern)
Definition: patternstore.h:845
virtual void write(std::ostream *out)=0
const_iterator begin() const
Definition: patternstore.h:730
unsigned int getcorpussize() const
Definition: patternstore.h:349
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:619
virtual void detachcorpus()
Definition: patternstore.h:342
virtual void insert(const PatternType &pattern)=0
Definition: patternstore.h:882
iterator end()
Definition: patternstore.h:224
Definition: patternstore.h:980
IndexedCorpus * corpus
Definition: patternstore.h:886
static const bool indexed
Definition: patternstore.h:982
const_iterator begin() const
Definition: patternstore.h:863
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:737
virtual void write(std::string filename)
Definition: patternstore.h:445
iterator erase(const_iterator position)
Definition: patternstore.h:626
IndexPattern * pointer
Definition: patternstore.h:161
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39
static const unsigned char delimiterclass
Definition: classdecoder.h:48
int sentencelength(int sentence) const
Definition: pattern.cpp:1806
virtual size_t size() const =0
PatternMap< PatternMap< ValueType, ValueHandler, NestedSizeType >, PatternStoreValueHandler< PatternMap< ValueType, ValueHandler, NestedSizeType > >, uint64_t >::const_iterator const_iterator
Definition: patternstore.h:1051
std::map< PatternPointer, ValueType >::const_iterator const_iterator
Definition: patternstore.h:914
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:753
ContainerType::const_iterator const_iterator
Definition: patternstore.h:371
bool erase(const PatternPointer &pattern)
Definition: patternstore.h:931
size_t size() const
Definition: patternstore.h:800
self_type operator++()
Definition: patternstore.h:182
iterator begin()
Definition: patternstore.h:810
Abstract Pattern store class, not to be instantiated directly.
Definition: patternstore.h:324
void insert(const Pattern &pattern)
Definition: patternstore.h:793
virtual std::string id()
Definition: patternstore.h:1022
ContainerType::iterator iterator
Definition: patternstore.h:370
unsigned int operator[](const IndexReference &ref)
Definition: patternstore.h:279
std::map< const Pattern, ValueType >::iterator iterator
Definition: patternstore.h:958
const_iterator end() const
Definition: patternstore.h:814
iterator begin()
Definition: patternstore.h:729
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:970
std::map< uint32_t, unsigned char * > sentenceindex
Definition: patternstore.h:50
virtual size_t size() const =0
iterator end()
Definition: patternstore.h:964
IndexedCorpus * corpus
Definition: patternstore.h:832
t_patternset data
Definition: patternstore.h:540
void read(std::istream *in, PatternStoreType &value)
Definition: patternstore.h:1023
const_iterator end() const
Definition: patternstore.h:733
unsigned int bytesize() const
Definition: patternstore.h:118
unsigned char * getcorpus() const
Definition: patternstore.h:346
iterator erase(const_iterator position)
Definition: patternstore.h:973
virtual ContainerType::iterator end()=0
virtual ContainerType::iterator begin()=0
const_iterator find(const PatternPointer &pattern) const
Definition: patternstore.h:875
Class for encoding plain-text to binary class-encoded data. The ClassEncoder maintains a mapping of w...
Definition: classencoder.h:50
PatternPointer getpattern() const
Definition: patternstore.h:112
PatternPointer getsentence(int sentence) const
Definition: pattern.cpp:1826
void write(std::ostream *out, const unsigned char *corpusstart=NULL) const
Definition: pattern.cpp:228
bool has(const Pattern &pattern) const
Definition: patternstore.h:847
iterator erase(const_iterator position)
Definition: patternstore.h:822
iterator find(const Pattern &pattern)
Definition: patternstore.h:616
virtual void insert(const PatternType &pattern, ValueType &value)=0
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:874
iterator begin()
Definition: patternstore.h:214
iterator(pointer ptr)
Definition: patternstore.h:167
const_iterator find(const Pattern &pattern) const
Definition: patternstore.h:736
unsigned char * data
Definition: pattern.h:360
size_t size()
Definition: patternstore.h:261
Definition: patternstore.h:936
Class for encoding plain-text to binary class-encoded data.
A nested pattern map, useful for storing patterns that map to other patterns, which in turn map to va...
Definition: patternstore.h:1047
virtual void write(std::ostream *out)
Definition: patternstore.h:432
iterator end()
Definition: patternstore.h:732
iterator begin()
Definition: patternstore.h:606
void debug()
Definition: patternstore.h:204
A pattern store in the form of an unordered set (i.e, no duplicates). Stores only patterns...
Definition: patternstore.h:538
virtual ContainerType::iterator find(const Pattern &pattern)=0
void read(std::istream *in, std::array< T, N > &a)
Definition: patternstore.h:984
Definition: common.h:35
unsigned char * beginpointer() const
Definition: patternstore.h:115
virtual std::string tostring(PatternStoreType &value)
Definition: patternstore.h:1029
Basic largely trivial functions for the common good.
const_iterator begin() const
Definition: patternstore.h:962
ValueHandler valuehandler
Definition: patternstore.h:400
reference operator*()
Definition: patternstore.h:200
~iterator()
Definition: patternstore.h:179
PatternType
Definition: pattern.h:59
iterator()
Definition: patternstore.h:176
virtual void use_v1_format()
Definition: patternstore.h:357
bool has(const PatternPointer &pattern) const
Definition: patternstore.h:904
std::unordered_map< PatternPointer, ValueType > data
Definition: patternstore.h:830
Abstract class for map-like pattern stores, do not instantiate directly.
Definition: patternstore.h:398
ValueType & operator[](const Pattern &pattern)
Definition: patternstore.h:856
iterator find(const Pattern &pattern)
Definition: patternstore.h:868
void reserve(size_t s)
Definition: patternstore.h:724
Definition: patternstore.h:828
virtual void attachcorpus(unsigned char *corpusstart, unsigned int corpussize)
Definition: patternstore.h:334
t_hashorderedpatternset::const_iterator const_iterator
Definition: patternstore.h:727
iterator(IndexReference iref, PatternPointer pp)
Definition: patternstore.h:170
const_iterator begin() const
Definition: patternstore.h:811
Contains the Pattern class that is ubiquitous throughout Colibri Core.
pointer pairpointer
Definition: patternstore.h:208
void insert(const Pattern &pattern)
Definition: patternstore.h:580
Class for decoding binary class-encoded data back to plain-text.
void write(std::ostream *out, std::array< T, N > &a)
Definition: patternstore.h:991
virtual bool has(const Pattern &) const =0
IndexPattern value_type
Definition: patternstore.h:159
size_t size() const
Definition: patternstore.h:852
bool has(const Pattern &pattern) const
Definition: patternstore.h:587
void readmap(std::istream *in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:666
unsigned int corpussize
Definition: patternstore.h:327
virtual bool erase(const PatternType &)=0
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:928
iterator find(const PatternPointer &pattern)
Definition: patternstore.h:969
t_patternset::iterator iterator
Definition: patternstore.h:600
iterator(reference ref)
Definition: patternstore.h:173
Definition: pattern.h:55
std::set< Pattern > t_hashorderedpatternset
Definition: patternstore.h:698
iterator erase(const_iterator position)
Definition: patternstore.h:932
bool has(const Pattern &pattern) const
Definition: patternstore.h:795
const_iterator begin() const
Definition: patternstore.h:607
A pattern store in the form of an ordered set (i.e, no duplicates). Stores only patterns, no values.
Definition: patternstore.h:709