Colibri Core
patternmodel.h
Go to the documentation of this file.
1 #ifndef PATTERNMODEL_H
2 #define PATTERNMODEL_H
3 
4 /*****************************
5 * Colibri Core
6 * by Maarten van Gompel
7 * Centre for Language Studies
8 * Radboud University Nijmegen
9 *
10 * http://proycon.github.io/colibri-core
11 *
12 * Licensed under GPLv3
13 *****************************/
14 
55 #include "patternstore.h"
56 #include "classencoder.h"
57 #include "algorithms.h"
58 #include <limits>
59 #include <cmath>
60 #include <cstdint>
61 #include <map>
62 #include <set>
63 #include <sstream>
64 #include <array>
65 #include <exception>
66 #include "bz2stream.h"
67 
68 
72 enum ModelType {
79 };
80 
81 
86  NONE = 0,
87  QUICK = 1,
88  COMPACT = 2,
89 };
90 
// Free function returning a model type as int for the given file name.
// NOTE(review): presumably it inspects the header of the model file and yields one of the
// ModelType values - the definition is not visible in this header, confirm against the .cpp.
94 int getmodeltype(const std::string & filename);
95 
96 
/**
 * Exception type whose message states that a pattern was not found in the model.
 *
 * what() is declared public so that it can be called on a NoSuchPattern object
 * directly, and not only through a std::exception reference.
 */
class NoSuchPattern: public std::exception {
  public:
    /** @return a static, NUL-terminated description of the error */
    const char* what() const noexcept override {
        return "Pattern not found in model";
    }
};
103 
104 
112  public:
113  int MINTOKENS;
114 
117 
122  //
125  int MINLENGTH;
126  int MAXLENGTH;
128 
134  bool DOSKIPGRAMS;
137  int MAXSKIPS;
138 
141 
142  int PRUNENONSUBSUMED; //< Prune all n-grams that are not subsumed by higher-order ngrams
143 
148  bool DORESET;
149 
150  bool QUIET;
151  bool DEBUG;
152 
158  MINTOKENS = -1; //defaults to 2 for building, 1 for loading
159  MINTOKENS_SKIPGRAMS = -1; //defaults to MINTOKENS
160  MINTOKENS_UNIGRAMS = 1; //defaults to, effectively disabled
161  MINLENGTH = 1;
162  MAXLENGTH = 100;
163  MAXBACKOFFLENGTH = 100;
164 
165  MINSKIPTYPES = 2;
166  MAXSKIPS = 3;
167  DOSKIPGRAMS = false;
168  DOSKIPGRAMS_EXHAUSTIVE = false;
169 
170  DOREVERSEINDEX = true; //obsolete
171  DOPATTERNPERLINE = false;
172  DORESET = false;
173 
174  DOREMOVEINDEX = false; //only for indexed models
175  DOREMOVENGRAMS = false;
176  DOREMOVESKIPGRAMS = false;
177  DOREMOVEFLEXGRAMS = false;
178 
179  PRUNENONSUBSUMED = false;
180 
181  DEBUG = false;
182  QUIET = false;
183  }
184 
189  MINTOKENS = ref.MINTOKENS; //defaults to 2 for building, 1 for loading
190  MINTOKENS_UNIGRAMS = ref.MINTOKENS_UNIGRAMS;
191  MINTOKENS_SKIPGRAMS = ref.MINTOKENS_SKIPGRAMS; //defaults to 2 for building, 1 for loading
192  MINLENGTH = ref.MINLENGTH;
193  MAXLENGTH = ref.MAXLENGTH;
194  MAXBACKOFFLENGTH = ref.MAXBACKOFFLENGTH;
195 
196  MINSKIPTYPES = ref.MINSKIPTYPES;
197  MAXSKIPS = ref.MAXSKIPS;
198  DOSKIPGRAMS = ref.DOSKIPGRAMS;
199  DOSKIPGRAMS_EXHAUSTIVE = ref.DOSKIPGRAMS_EXHAUSTIVE;
200 
201  DOREVERSEINDEX = ref.DOREVERSEINDEX;
202  DOPATTERNPERLINE = ref.DOPATTERNPERLINE;
203  DORESET = ref.DORESET;
204 
205  DOREMOVEINDEX = ref.DOREMOVEINDEX; //only for indexed models
206  DOREMOVENGRAMS = ref.DOREMOVENGRAMS;
207  DOREMOVESKIPGRAMS = ref.DOREMOVESKIPGRAMS;
208  DOREMOVEFLEXGRAMS = ref.DOREMOVEFLEXGRAMS;
209 
210  PRUNENONSUBSUMED = ref.PRUNENONSUBSUMED;
211 
212  DEBUG = ref.DEBUG;
213  QUIET = ref.QUIET;
214  }
215 
216 
217 };
218 
231 
232 typedef PatternMap<uint32_t,BaseValueHandler<uint32_t>,uint64_t>::iterator t_relationmap_iterator; //needed for Cython
234 
239  public:
244  virtual int getmodeltype() const=0;
245 
249  virtual int getmodelversion() const=0;
250 
251  //these are already in PatternStoreInterface:
252  //virtual bool has(const Pattern &) const =0;
253  //virtual bool has(const PatternPointer &) const =0;
254  //virtual size_t size() const =0;
255 
259  virtual unsigned int occurrencecount(const Pattern & pattern)=0;
260 
265  virtual double frequency(const Pattern &) =0;
266 
270  virtual int maxlength() const=0;
274  virtual int minlength() const=0;
275 
280  virtual unsigned int types() =0;
281 
286  virtual unsigned int tokens() const=0;
287 
289  return (PatternStoreInterface*) this;
290  };
291 };
292 
293 
294 
299 class PatternSetModel: public PatternSet<uint64_t>, public PatternModelInterface {
300  protected:
301  unsigned char model_type;
302  unsigned char model_version;
303  uint64_t totaltokens; //INCLUDES TOKENS NOT COVERED BY THE MODEL!
304  uint64_t totaltypes; //TOTAL UNIGRAM TYPES, INCLUDING NOT COVERED BY THE MODEL!
305  int maxn;
306  int minn;
307  public:
312  totaltokens = 0;
313  totaltypes = 0;
314  maxn = 0;
315  minn = 999;
316  model_type = this->getmodeltype();
317  model_version = this->getmodelversion();
318  }
319 
325  PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) {
326  totaltokens = 0;
327  totaltypes = 0;
328  maxn = 0;
329  minn = 999;
330  model_type = this->getmodeltype();
331  model_version = this->getmodelversion();
332  this->load(f,options, constrainmodel);
333  }
334 
341  PatternSetModel(const std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
342  totaltokens = 0;
343  totaltypes = 0;
344  maxn = 0;
345  minn = 999;
346  model_type = this->getmodeltype();
347  model_version = this->getmodelversion();
348  if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
349  std::ifstream * in = new std::ifstream(filename.c_str());
350  if (!in->good()) {
351  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
352  throw InternalError();
353  }
354  this->load( (std::istream *) in, options, constrainmodel);
355  in->close();
356  delete in;
357  }
358  virtual int getmodeltype() const { return PATTERNSETMODEL; }
359  virtual int getmodelversion() const { return 2; }
360 
361  virtual size_t size() const {
363  }
364  virtual bool has(const Pattern & pattern) const {
365  return PatternSet<uint64_t>::has(pattern);
366  }
367  virtual bool has(const PatternPointer & pattern) const {
368  return PatternSet<uint64_t>::has(pattern);
369  }
370 
377  virtual void load(std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
378  if (!options.QUIET) std::cerr << "Loading " << filename << " as set-model" << std::endl;
379  std::ifstream * in = new std::ifstream(filename.c_str());
380  if (!in->good()) {
381  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
382  throw InternalError();
383  }
384  this->load( (std::istream *) in, options, constrainmodel);
385  in->close();
386  delete in;
387  }
388 
394  virtual void load(std::istream * f, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) { //load from file
395  char null;
396  f->read( (char*) &null, sizeof(char));
397  f->read( (char*) &model_type, sizeof(char));
398  f->read( (char*) &model_version, sizeof(char));
399  if (model_version == 1) this->classencodingversion = 1;
400  if ((null != 0) || ((model_type != UNINDEXEDPATTERNMODEL) && (model_type != INDEXEDPATTERNMODEL) && (model_type != PATTERNSETMODEL) && (model_type != PATTERNALIGNMENTMODEL) )) {
401  std::cerr << "ERROR: File is not a colibri patternmodel file" << std::endl;
402  throw InternalError();
403  }
404  if (model_version > 2) {
405  std::cerr << "WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
406  }
407  f->read( (char*) &totaltokens, sizeof(uint64_t));
408  f->read( (char*) &totaltypes, sizeof(uint64_t));
409 
410  PatternStoreInterface * constrainstore = NULL;
411  if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();
412 
413  if (options.DEBUG) {
414  std::cerr << "Debug enabled, loading PatternModel type " << (int) model_type << ", version " << (int) model_version << ", classencodingversion" << (int) this->classencodingversion << std::endl;
415  std::cerr << "Total tokens: " << totaltokens << ", total types: " << totaltypes << std::endl;;
416  }
417  if (model_type == PATTERNSETMODEL) {
418  //reading set
419  PatternSet<uint64_t>::read(f, options.MINLENGTH, options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS); //read PatternStore
420  } else if (model_type == INDEXEDPATTERNMODEL) {
421  //reading from indexed pattern model, ok:
422  readmap<IndexedData,IndexedDataHandler>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
423  } else if (model_type == UNINDEXEDPATTERNMODEL) {
424  //reading from unindexed pattern model, ok:
425  readmap<uint32_t,BaseValueHandler<uint32_t>>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
426  } else if (model_type == PATTERNALIGNMENTMODEL) {
427  //ok:
428  readmap<PatternFeatureVectorMap<double>, PatternFeatureVectorMapHandler<double>>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
429  } else {
430  std::cerr << "ERROR: Unknown model type" << std::endl;
431  throw InternalError();
432  }
433  }
434 
438  void write(std::ostream * out) {
439  const char null = 0;
440  out->write( (char*) &null, sizeof(char));
441  unsigned char t = this->getmodeltype();
442  out->write( (char*) &t, sizeof(char));
443  unsigned char v = this->getmodelversion();
444  out->write( (char*) &v, sizeof(char));
445  out->write( (char*) &totaltokens, sizeof(uint64_t));
446  const uint64_t tp = this->types(); //use this instead of totaltypes, as it may need to be computed on-the-fly still
447  out->write( (char*) &tp, sizeof(uint64_t));
448  PatternSet<uint64_t>::write(out); //write
449  }
450 
455  void write(const std::string & filename) {
456  std::ofstream * out = new std::ofstream(filename.c_str());
457  this->write(out);
458  out->close();
459  delete out;
460  }
461 
466  return (PatternModelInterface*) this;
467  }
468 
473  virtual unsigned int occurrencecount(const Pattern & pattern) { return 0; }
474 
479  virtual double frequency(const Pattern &) { return 0; }
480 
483 
487  virtual int maxlength() const { return maxn; };
488 
492  virtual int minlength() const { return minn; };
493 
498  virtual unsigned int types() {
499  return totaltypes;
500  }
505  virtual unsigned int tokens() const { return totaltokens; }
506 
511  unsigned char type() const { return model_type; }
515  unsigned char version() const { return model_version; }
516 };
517 
518 
525 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class MapType = PatternMap<ValueType, BaseValueHandler<ValueType>>, class PatternType = Pattern>
526 class PatternModel: public MapType, public PatternModelInterface {
527  protected:
528  unsigned char model_type;
529  unsigned char model_version;
530  uint64_t totaltokens;
531  uint64_t totaltypes;
532 
533  int maxn;
534  int minn;
535 
536  //std::multimap<IndexReference,Pattern> reverseindex;
537  std::set<int> cache_categories;
538  std::set<int> cache_n;
539  std::map<int,std::map<int,unsigned int>> cache_grouptotal;
540  std::map<int,std::map<int,unsigned int>> cache_grouptotalpatterns ;
541  std::map<int,std::map<int,unsigned int>> cache_grouptotalwordtypes;
542  std::map<int,std::map<int,unsigned int>> cache_grouptotaltokens;
543 
544 
545  std::map<int, std::vector< uint32_t > > gapmasks;
546 
547  virtual void postread(const PatternModelOptions options) {
548  //this function has a specialisation specific to indexed pattern models,
549  //this is the generic version
550  for (iterator iter = this->begin(); iter != this->end(); iter++) {
551  const PatternType p = iter->first;
552  const int n = p.n();
553  if (n > maxn) maxn = n;
554  if (n < minn) minn = n;
555  if ((!hasskipgrams) && (p.isskipgram())) hasskipgrams = true;
556  }
557  }
558  virtual void posttrain(const PatternModelOptions options) {
559  //nothing to do here, indexed model specialised this function to
560  //sort indices
561  }
562  public:
566 
571  totaltokens = 0;
572  totaltypes = 0;
573  maxn = 0;
574  minn = 999;
575  hasskipgrams = false;
576  model_type = this->getmodeltype();
577  model_version = this->getmodelversion();
578  if (corpus) {
579  this->reverseindex = corpus;
580  this->attachcorpus(*corpus);
581  } else {
582  this->reverseindex = NULL;
583  }
584  reverseindex_internal = false;
585  }
586 
594  PatternModel<ValueType,ValueHandler,MapType,PatternType>(std::istream *f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL) {
595  totaltokens = 0;
596  totaltypes = 0;
597  maxn = 0;
598  minn = 999;
599  hasskipgrams = false;
600  model_type = this->getmodeltype();
601  model_version = this->getmodelversion();
602  this->load(f,options,constrainmodel);
603  if (corpus) {
604  this->reverseindex = corpus;
605  this->attachcorpus(*corpus);
606  } else {
607  this->reverseindex = NULL;
608  }
609  reverseindex_internal = false;
610  }
611 
613  if (reverseindex_internal && reverseindex != NULL) delete reverseindex;
614  }
622  PatternModel<ValueType,ValueHandler,MapType,PatternType>(const std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL) { //load from file
623  //IndexedPatternModel will overload this
624  totaltokens = 0;
625  totaltypes = 0;
626  maxn = 0;
627  minn = 999;
628  hasskipgrams = false;
629  model_type = this->getmodeltype();
630  model_version = this->getmodelversion();
631  if (corpus) {
632  this->reverseindex = corpus;
633  this->attachcorpus(*corpus);
634  } else {
635  this->reverseindex = NULL;
636  }
637  reverseindex_internal = false;
638  if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
639  std::ifstream * in = new std::ifstream(filename.c_str());
640  if (!in->good()) {
641  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
642  throw InternalError();
643  }
644  this->load( (std::istream *) in, options, constrainmodel);
645  in->close();
646  delete in;
647  }
648 
649 
653  virtual int getmodeltype() const { return UNINDEXEDPATTERNMODEL; }
657  virtual int getmodelversion() const { return 2; }
658 
662  virtual size_t size() const {
663  return MapType::size();
664  }
665 
669  virtual bool has(const Pattern & pattern) const {
670  return MapType::has(pattern);
671  }
672  virtual bool has(const PatternPointer & pattern) const {
673  return MapType::has(pattern);
674  }
675 
682  virtual void load(std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
683  if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
684  std::ifstream * in = new std::ifstream(filename.c_str());
685  if (!in->good()) {
686  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
687  throw InternalError();
688  }
689  this->load( (std::istream *) in, options, constrainmodel);
690  in->close();
691  delete in;
692  }
693 
700  virtual void load(std::istream * f, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) { //load from file
701  char null;
702  f->read( (char*) &null, sizeof(char));
703  f->read( (char*) &model_type, sizeof(char));
704  f->read( (char*) &model_version, sizeof(char));
705  if (model_version == 1) this->classencodingversion = 1;
706  if ((null != 0) || ((model_type != UNINDEXEDPATTERNMODEL) && (model_type != UNINDEXEDPATTERNPOINTERMODEL) && (model_type != INDEXEDPATTERNMODEL) && (model_type != INDEXEDPATTERNPOINTERMODEL) && (model_type != PATTERNALIGNMENTMODEL) )) {
707  std::cerr << "File is not a colibri model file (or a very old one)" << std::endl;
708  throw InternalError();
709  }
710  if (model_version > 2) {
711  std::cerr << "WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
712  }
713  if (options.DEBUG) {
714  std::cerr << "Debug enabled, loading PatternModel type " << (int) model_type << ", version " << (int) model_version << ", classencodingversion=" << (int) this->classencodingversion << std::endl;
715  }
716  if ((model_type == UNINDEXEDPATTERNPOINTERMODEL) || (model_type == INDEXEDPATTERNPOINTERMODEL)) {
717  this->patterntype = PATTERNPOINTER;
718  if (options.DEBUG) std::cerr << "Reading corpus data" << std::endl;
719  unsigned int corpussize;
720  f->read( (char*) &corpussize, sizeof(unsigned int));
721  unsigned char * corpusdata = new unsigned char[corpussize];
722  f->read((char*) corpusdata,sizeof(unsigned char) * corpussize);
723  reverseindex = new IndexedCorpus(corpusdata, corpussize);
724  this->attachcorpus(*reverseindex);
725  reverseindex_internal = true;
726  if (options.DEBUG) std::cerr << "(read " << corpussize << " bytes)" << std::endl;
727  }
728  f->read( (char*) &totaltokens, sizeof(uint64_t));
729  f->read( (char*) &totaltypes, sizeof(uint64_t));
730 
731  PatternStoreInterface * constrainstore = NULL;
732  if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();
733 
734  if (options.DEBUG) {
735  std::cerr << "Total tokens: " << totaltokens << ", total types: " << totaltypes << std::endl;;
736  }
737 
738 
739  if (((model_type == INDEXEDPATTERNMODEL) && (this->getmodeltype() == UNINDEXEDPATTERNMODEL)) || ((model_type == INDEXEDPATTERNPOINTERMODEL) && (this->getmodeltype() == UNINDEXEDPATTERNPOINTERMODEL))) {
740  //reading indexed pattern model as unindexed, (or indexed patternPOINTErmodels as unindexed patternPOINTERmodels)
741  MapType::template read<IndexedData,IndexedDataHandler,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG);
742  } else if ((model_type == UNINDEXEDPATTERNMODEL) && (this->getmodeltype() == INDEXEDPATTERNMODEL)) {
743  //reading unindexed model as indexed, this will load the patterns but lose all the counts
744  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG);
745  } else if ((model_type == UNINDEXEDPATTERNPOINTERMODEL) && (this->getmodeltype() == UNINDEXEDPATTERNMODEL)) {
746  //reading unindexed pointermodel as unindexed patternmodel
747  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,PatternPointer>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG);
748  } else if ((model_type == INDEXEDPATTERNPOINTERMODEL) && ((this->getmodeltype() == INDEXEDPATTERNMODEL) || (this->getmodeltype() == UNINDEXEDPATTERNMODEL))) {
749  //reading indexed patternpointermodel as (un)indexed patternmodel
750  MapType::template read<IndexedData,IndexedDataHandler,PatternPointer>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG);
751  } else if (model_type == PATTERNALIGNMENTMODEL) {
752  //reading pattern alignment model as pattern model, can be
753  //done, but semantics change: count corresponds to the number of distinct alignments (for unindexed models)
754  //indexed models will lose all counts
755  MapType::template read<PatternFeatureVectorMap<double>,PatternFeatureVectorMapHandler<double>,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS,options.DORESET, options.DEBUG);
756  } else {
757  MapType::template read(f, options.MINTOKENS,options.MINLENGTH, options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG); //read PatternStore (also works for reading unindexed pattern models as indexed, which will load patterns but lose the counts)
758  }
759  this->postread(options);
760  }
761 
766  return (PatternModelInterface*) this;
767  }
768 
/**
 * Train the model: count the patterns found in the input and prune those below the thresholds.
 *
 * Sentences come from the attached reverse index (`reverseindex`) when it is set;
 * otherwise they are read from `in`. Unless everything can be done in a single pass
 * (MINTOKENS == 1 or a constraint model is given), the input is traversed once per
 * pattern length n = 1..MAXLENGTH and an n-gram is only added when all of its
 * (n-1)-grams (capped at MAXBACKOFFLENGTH) are already in the model.
 *
 * @param in  input stream from which sentences are read as Pattern objects; may be NULL
 *            (data version 2 is then assumed and only the reverse index can supply sentences)
 * @param options  training options, taken by value; MINTOKENS (-1 becomes 2, 0 becomes 1) and
 *                 MINTOKENS_SKIPGRAMS (raised to at least MINTOKENS) are normalised on the local copy
 * @param constrainbymodel  when set, only patterns present in this model are counted; when it is
 *                          `this`, totaltypes/totaltokens are reset to 0, otherwise they are copied from it
 * @param continued  continue training on a model that already holds data (lengths already counted are skipped)
 * @param firstsentence  the sentence counter starts at firstsentence-1 and is incremented before each sentence
 * @param ignoreerrors  when true, an exception raised while processing a pattern is reported on stderr and
 *                      that pattern is skipped; otherwise InternalError is thrown
 */
778 virtual void train(std::istream * in , PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
// normalise the thresholds on the local copy of the options
779 if (options.MINTOKENS == -1) options.MINTOKENS = 2;
780 if (options.MINTOKENS == 0) options.MINTOKENS = 1;
781 if (options.MINTOKENS_SKIPGRAMS < options.MINTOKENS) options.MINTOKENS_SKIPGRAMS = options.MINTOKENS;
782 if (constrainbymodel == this) {
783 totaltypes = 0;
784 totaltokens = 0;
785 } else if (constrainbymodel != NULL) {
786 totaltypes = constrainbymodel->types();
787 totaltokens = constrainbymodel->tokens();
788 }
789 uint32_t sentence = firstsentence-1;
// data version is detected from the stream when there is one, otherwise 2 is assumed
790 const unsigned char version = (in != NULL) ? getdataversion(in) : 2;
791 
792 bool iter_unigramsonly = false; //only needed for counting unigrams when we need them but they would be discarded
793 bool skipunigrams = false; //will be set to true later only when MINTOKENS=1,MINLENGTH=1 to prevent double counting of unigrams
794 if (( (options.MINLENGTH > 1) ||(options.MINTOKENS == 1)) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS)) {
795 iter_unigramsonly = true;
796 }
797 
798 if (!options.QUIET) {
799 std::cerr << "Training patternmodel";
800 if (constrainbymodel != NULL) std::cerr << ", constrained by another model";
801 std::cerr << ", occurrence threshold: " << options.MINTOKENS;
802 if (iter_unigramsonly) std::cerr << ", secondary word occurrence threshold: " << options.MINTOKENS_UNIGRAMS;
803 if (version < 2) std::cerr << ", class encoding version: " << (int) version;
804 std::cerr << std::endl;
805 }
806 std::vector<std::pair<PatternPointer,int> > ngrams;
807 std::vector<PatternPointer> subngrams;
808 bool found;
809 IndexReference ref;
// prevsize: model size before the current pass, used to derive how many n-grams the pass added
810 int prevsize = this->size();
811 if (constrainbymodel == this) prevsize = 0; //going over same model
812 int backoffn = 0;
// linepattern owns the sentence most recently read from `in`; it is deleted before each new read and at the end
813 Pattern * linepattern = NULL;
814 
815 if (!this->data.empty()) {
816 if ((continued) && (!options.QUIET)) std::cerr << "Continuing training on preloaded model, computing statistics..." << std::endl;
817 this->computestats();
818 }
819 
// one pass over the input per pattern length n (the loop is left early in single-pass mode, see the break further down)
820 for (int n = 1; n <= options.MAXLENGTH; n++) {
821 bool skipgramsonly = false; //only used when continued==true, prevent double counting of n-grams whilst allowing skipgrams to be counted later
822 if (continued) {
823 if ((options.MINTOKENS > 1) && (constrainbymodel == NULL)) {
824 if (cache_grouptotal[NGRAM][n] > 0) {
825 if ((options.DOSKIPGRAMS_EXHAUSTIVE) && (cache_grouptotal[SKIPGRAM][n] == 0) ) {
826 skipgramsonly= true;
827 } else {
828 if (!options.QUIET) std::cerr << "Skipping " << n << "-grams, already in model" << std::endl;
829 continue;
830 }
831 }
832 }
833 }
834 int foundngrams = 0;
835 int foundskipgrams = 0;
// rewind the stream for this pass; for version >= 2 reading restarts at offset 2
// NOTE(review): presumably the first two bytes are a data header - confirm against getdataversion()
836 if (in != NULL) {
837 in->clear();
838 if (version >= 2) {
839 in->seekg(2);
840 } else {
841 in->seekg(0);
842 }
843 }
844 if (!options.QUIET) {
845 if (iter_unigramsonly) {
846 std::cerr << "Counting unigrams using secondary word occurrence threshold (" << options.MINTOKENS_UNIGRAMS << ")" << std::endl;
847 } else if (options.DOPATTERNPERLINE) {
848 std::cerr << "Counting patterns from list, one per line" << std::endl;
849 } else if (constrainbymodel != NULL) {
850 std::cerr << "Counting n-grams that occur in constraint model" << std::endl;
851 } else if (options.MINTOKENS > 1) {
852 std::cerr << "Counting " << n << "-grams" << std::endl;
853 if (skipgramsonly) std::cerr << "(only counting skipgrams actually, n-grams already counted earlier)" << std::endl;
854 } else {
855 std::cerr << "Counting *all* n-grams (occurrence threshold=1)" << std::endl;
856 }
857 }
858 
// gap masks for length n are computed once and cached in gapmasks
859 if ((options.DOSKIPGRAMS_EXHAUSTIVE) && (gapmasks[n].empty())) gapmasks[n] = compute_skip_configurations(n, options.MAXSKIPS);
860 
861 
862 
863 sentence = firstsentence-1; //reset
864 bool singlepass = false;
865 const unsigned int sentences = (reverseindex != NULL) ? reverseindex->sentences() : 0;
// iterate over all sentences: from the reverse index when present, otherwise from the stream until EOF
866 while (((reverseindex != NULL) && (sentence < sentences)) || ((reverseindex == NULL) && (in != NULL) && (!in->eof()))) {
867 sentence++;
868 //read line
869 if (linepattern != NULL) delete linepattern;
870 if (reverseindex == NULL) linepattern = new Pattern(in,false,version);
871 PatternPointer line = (reverseindex != NULL) ? reverseindex->getsentence(sentence) : PatternPointer(linepattern);
872 //if (in->eof()) break;
873 const unsigned int linesize = line.n();
874 if (options.DEBUG) std::cerr << "Processing line " << sentence << ", size (tokens) " << linesize << " (bytes) " << line.bytesize() << ", n=" << n << std::endl;
875 if (linesize == 0) {
876 //skip empty lines
877 continue;
878 }
879 //count total tokens
880 if ((n==1) && (!continued)) totaltokens += linesize;
881 
882 
// collect the candidate patterns of this sentence: the whole line (pattern-per-line mode),
// the n-grams of the current length, or all lengths at once (single pass)
883 ngrams.clear();
884 if (options.DOPATTERNPERLINE) {
885 if (linesize > (unsigned int) options.MAXLENGTH) continue;
886 ngrams.push_back(std::pair<PatternPointer,int>(line,0));
887 } else {
888 if (iter_unigramsonly) {
889 line.ngrams(ngrams, n);
890 } else if ((options.MINTOKENS > 1) && (constrainbymodel == NULL)) {
891 line.ngrams(ngrams, n);
892 } else {
893 singlepass = true;
894 int minlength = options.MINLENGTH;
895 if (continued) minlength = this->maxn + 1;
896 line.subngrams(ngrams,minlength,options.MAXLENGTH); //extract ALL ngrams if MINTOKENS == 1 or a constraint model is set, no need to look back anyway, only one iteration over corpus
897 }
898 }
899 if (options.DEBUG) std::cerr << "\t" << ngrams.size() << " ngrams in line" << std::endl;
900 
901 
902 // *** ITERATION OVER ALL NGRAMS OF CURRENT ORDER (n) IN THE LINE/SENTENCE ***
903 for (std::vector<std::pair<PatternPointer,int>>::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
904 
905 try {
906 if ((singlepass) && (options.MINLENGTH == 1) && (skipunigrams) && (iter->first.n() == 1)) {
907 //prevent double counting of unigrams after a iter_unigramsonly run with mintokens==1
908 continue;
909 }
910 
911 if (!skipgramsonly) {
912 //check against constraint model
913 if ((constrainbymodel != NULL) && (!iter_unigramsonly) && (!constrainbymodel->has(iter->first))) continue;
914 
915 
916 found = true; //are the submatches in order? (default to true, attempt to falsify, needed for mintokens==1)
917 
918 //unigram check, special scenario, not usually processed!! (normal lookback suffices for most uses)
919 if ((!iter_unigramsonly) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS) && ((n > 1) || (singlepass)) ) {
920 subngrams.clear();
921 iter->first.ngrams(subngrams,1); //get all unigrams
922 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
923 //check if unigram reaches threshold
924 if (this->occurrencecount(*iter2) < (unsigned int) options.MINTOKENS_UNIGRAMS) {
925 found = false;
926 break;
927 }
928 }
929 }
930 
931 
932 //ngram (n-1) lookback
933 if ((found) && (n > 1) && (options.MINTOKENS > 1) && (!options.DOPATTERNPERLINE) && (constrainbymodel == NULL)) {
934 //check if sub-parts were counted
935 subngrams.clear();
936 backoffn = n - 1;
937 if (backoffn > options.MAXBACKOFFLENGTH) backoffn = options.MAXBACKOFFLENGTH;
938 iter->first.ngrams(subngrams, backoffn);
939 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
940 if (!this->has(*iter2)) {
941 found = false;
942 break;
943 }
944 }
945 }
946 
947 
948 ref = IndexReference(sentence, iter->second); //this is one token, we add the tokens as we find them, one by one
949 if ((found) && (!skipgramsonly)) {
950 if (options.DEBUG) std::cerr << "\t\tAdding @" << ref.sentence << ":" << ref.token << " n=" << iter->first.n() << " category=" <<(int) iter->first.category()<< std::endl;
951 add(iter->first, ref);
952 }
953 }
// exhaustive skipgram extraction for this n-gram (note: when skipgramsonly is set, ref keeps its previous value here)
954 if (((n >= 3) || (options.MINTOKENS == 1)) //n is always 1 when mintokens == 1 !!
955 && (options.DOSKIPGRAMS_EXHAUSTIVE)) {
956 int foundskipgrams_thisround = this->computeskipgrams(iter->first, options, &ref, NULL, constrainbymodel, true );
957 if (foundskipgrams_thisround > 0) hasskipgrams = true;
958 foundskipgrams += foundskipgrams_thisround;
959 }
960 } catch (std::exception &e) {
961 std::cerr << "ERROR: An internal error has occured during training!!!" << std::endl;
962 if (ignoreerrors) continue;
963 throw InternalError();
964 }
965 }
966 }
967 
968 
// end of the pass over the input: update statistics and prune
969 if (!iter_unigramsonly) {
970 foundngrams = this->size() - foundskipgrams - prevsize;
971 
972 if ((foundngrams) || (foundskipgrams)) {
973 if (n > this->maxn) this->maxn = n;
974 if (n < this->minn) this->minn = n;
975 } else {
976 if (!options.QUIET) std::cerr << "None found" << std::endl;
977 if (!continued) break;
978 }
979 if (!options.QUIET) std::cerr << " Found " << foundngrams << " ngrams...";
980 if (options.DOSKIPGRAMS_EXHAUSTIVE && !options.QUIET) std::cerr << foundskipgrams << " skipgram occurrences...";
981 if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == this))) {
982 if ((options.MINTOKENS > 1) && (n == 1)) {
983 totaltypes = this->size(); //total unigrams, also those not in model
984 } else if ((options.MINTOKENS == 1) && (options.MINLENGTH == 1)) {
985 if (!options.QUIET) std::cerr << " computing total word types prior to pruning...";
986 totaltypes = totalwordtypesingroup(NGRAM,1);
987 if (!options.QUIET) std::cerr << totaltypes << "...";
988 }
989 }
990 unsigned int pruned;
991 if (singlepass) {
992 pruned = this->prune(options.MINTOKENS,0); //prune regardless of size
993 } else {
994 pruned = this->prune(options.MINTOKENS,n); //prune only in size-class
995 if ( (!options.DOSKIPGRAMS) && (!options.DOSKIPGRAMS_EXHAUSTIVE) && ( n - 1 >= 1) && ( (n - 1) < options.MINLENGTH) && (n - 1 != options.MAXBACKOFFLENGTH) &&
996 !( (n-1 == 1) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS) ) //don't delete unigrams if we're gonna need them
997 ) {
998 //we don't need n-1 anymore now we're done with n, it
999 //is below our threshold, prune it all (== -1)
1000 this->prune(-1, n-1);
1001 if (!options.QUIET) std::cerr << " (pruned last iteration due to minimum length)" << pruned;
1002 }
1003 }
1004 if (!options.QUIET) std::cerr << "pruned " << pruned;
1005 if (foundskipgrams) {
1006 unsigned int prunedextra;
1007 if ((options.MINTOKENS == 1) || (constrainbymodel != NULL)) {
1008 prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, 0);
1009 } else {
1010 prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, n);
1011 }
1012 if (prunedextra && !options.QUIET) std::cerr << " plus " << prunedextra << " extra skipgrams..";
1013 pruned += prunedextra;
1014 }
1015 if (!options.QUIET) std::cerr << "...total kept: " << (foundngrams + foundskipgrams) - pruned << std::endl;
1016 if (((options.MINTOKENS == 1) || (constrainbymodel != NULL))) break; //no need for further n iterations, we did all in one pass since there's no point in looking back
1017 } else { //iter_unigramsonly
1018 if (!options.QUIET) std::cerr << "found " << this->size() << std::endl;
1019 
1020 if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == this))) {
1021 if (!options.QUIET) std::cerr << " computing total word types prior to pruning...";
1022 totaltypes = this->size();
1023 if (!options.QUIET) std::cerr << totaltypes << "...";
1024 }
1025 //prune the unigrams based on the word occurrence threshold
1026 this->prune(options.MINTOKENS_UNIGRAMS,1);
1027 //normal behaviour next round
1028 iter_unigramsonly = false;
1029 if ((n == 1) && (options.MINLENGTH ==1)) skipunigrams = true; //prevent double counting of unigrams
1030 //decrease n so it will be the same (always 1) next (and by definition last) iteration
1031 n--;
1032 }
1033 prevsize = this->size();
1034 }
// post-processing once all passes are done
1035 if (options.DOSKIPGRAMS && !options.DOSKIPGRAMS_EXHAUSTIVE) {
1036 this->trainskipgrams(options, constrainbymodel);
1037 }
1038 if (options.MINTOKENS == 1) {
1039 //needed to compute maxn, minn
1040 this->postread(options);
1041 }
1042 if (options.MAXBACKOFFLENGTH < options.MINLENGTH) {
1043 this->prune(-1, options.MAXBACKOFFLENGTH);
1044 }
1045 if ((options.MINLENGTH > 1) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS)) {
1046 //prune the unigrams again
1047 this->prune(-1,1);
1048 }
// PRUNENONSUBSUMED doubles as the starting length: for n = begin_n down to 2, all (n-1)-grams
// that are not part of any n-gram still in the model are removed
1049 if (options.PRUNENONSUBSUMED) {
1050 if (!options.QUIET) std::cerr << "Pruning non-subsumed n-grams" << std::endl;
1051 int begin_n = options.PRUNENONSUBSUMED;
1052 if ((begin_n > options.MAXLENGTH)) begin_n = options.MAXLENGTH;
1053 for (int n = begin_n; n > 1; n--) {
1054 std::unordered_set<Pattern> subsumed;
1055 unsigned int prunednonsubsumed = 0;
1056 PatternModel::iterator iter = this->begin();
1057 while (iter != this->end()) {
1058 const unsigned int pattern_n = iter->first.n();
1059 if (pattern_n == (unsigned int) n) {
1060 subngrams.clear();
1061 iter->first.ngrams(subngrams, n-1);
1062 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(Pattern(*iter2));
1063 }
1064 iter++;
1065 };
1066 prunednonsubsumed += this->prunenotinset(subsumed, n-1);
1067 if (!options.QUIET) std::cerr << " pruned " << prunednonsubsumed << " non-subsumed " << (n-1) << "-grams" << std::endl;
1068 }
1069 }
1070 this->posttrain(options);
1071 if (linepattern != NULL) delete linepattern;
1072 }
1073 
1074 
1081  virtual void train(const std::string & filename, PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
1082  if ((filename.size() > 3) && (filename.substr(filename.size()-3) == ".bz2")) {
1083  std::ifstream * in = new std::ifstream(filename.c_str(), std::ios::in|std::ios::binary);
1084  bz2istream * decompressor = new bz2istream(in->rdbuf());
1085  this->train( (std::istream*) decompressor, options, constrainbymodel, continued, firstsentence, ignoreerrors);
1086  delete decompressor;
1087  delete in;
1088  } else {
1089  std::ifstream * in = new std::ifstream(filename.c_str());
1090  this->train((std::istream*) in, options, constrainbymodel, continued, firstsentence, ignoreerrors);
1091  in->close();
1092  delete in;
1093  }
1094  }
1095 
1096 
// Generate skipgram candidates for a single pattern by applying every gap
// mask stored in gapmasks[n] (n = pattern.n()); the masks are computed on
// first use through compute_skip_configurations(n, maxskips).
//
// Parameters:
//   pattern          - the pattern in which gaps are punched
//   mintokens        - -1 is mapped to 2, anything <= 1 is mapped to 1; the
//                      value is only consulted to decide whether the
//                      sub-pattern checks run (mintokens != 1 and no
//                      constraint model)
//   singleref        - if set, each accepted skipgram is added to the model
//                      with this single reference
//   multiplerefs     - used when singleref is NULL: each accepted skipgram is
//                      added once per reference in this collection
//   constrainbymodel - if set, candidates not present in that model are
//                      dropped and the sub-pattern checks are skipped
//   targetcontainer  - if set, accepted skipgrams are appended here (possibly
//                      with duplicates) instead of being added to the model
//   exhaustive       - additionally require all parts of the skipgram to be
//                      present in this model (only in the check_extra path)
//   maxskips         - passed to compute_skip_configurations()
//   DEBUG            - emit diagnostics on std::cerr
//
// Returns the number of skipgrams that were not yet in the model (model
// mode), or the number of skipgrams appended to targetcontainer.
//
// InternalError thrown while handling one candidate is caught per candidate:
// a message is printed and the loop continues with the next gap mask. This
// includes the InternalError raised when neither singleref nor multiplerefs
// is supplied in model mode.
1101  virtual int computeskipgrams(const PatternPointer & pattern, int mintokens = 2, const IndexReference * singleref= NULL, const IndexedData * multiplerefs = NULL, PatternModelInterface * constrainbymodel = NULL, std::vector<PatternPointer> * targetcontainer = NULL, const bool exhaustive = false, const int maxskips = 3, const bool DEBUG = false) {
1102 
1103  //if targetcontainer is NULL, skipgrams will be added to the model,
1104  // if not null , they will be added to the targetcontainer instead
1105 
1106  if (mintokens == -1) mintokens = 2;;
1107  if (mintokens <= 1) {
1108  mintokens = 1;
1109  }
1110  //internal function for computing skipgrams for a single pattern
1111  int foundskipgrams = 0;
1112  const int n = pattern.n();
1113  std::vector<PatternPointer> subngrams;
1114 
// NOTE(review): gapmasks is indexed by n without a visible bounds check;
// its declaration is outside this chunk - confirm n cannot exceed its size.
1115  if (gapmasks[n].empty()) gapmasks[n] = compute_skip_configurations(n, maxskips);
1116 
1117  //loop over all possible gap configurations
1118  int gapconf_i = 0;
1119  for (std::vector<uint32_t>::iterator iter2 = gapmasks[n].begin(); iter2 != gapmasks[n].end(); iter2++, gapconf_i++) {
1120  if (*iter2 == 0) continue; //precaution (doesn't really happen anyway, but better safe than sorry)
1121 
1122  //add skips
1123  try {
1124  PatternPointer skipgram = pattern;
1125  skipgram.mask = *iter2;
1126 
1127  if (DEBUG) {
1128  std::cerr << "Checking for: " << std::endl;
1129  skipgram.out();
1130  }
1131 
1132  if ((constrainbymodel != NULL) && (!constrainbymodel->has(skipgram))) continue;
1133 
1134  if (DEBUG) {
1135  if ((int) skipgram.n() != n) {
1136  std::cerr << "Generated invalid skipgram, n=" << skipgram.n() << ", expected " << n << std::endl;
1137  throw InternalError();
1138  }
1139  }
1140 
1141  bool skipgram_valid = true;
1142  if ((mintokens != 1) && (constrainbymodel == NULL)) {
1143  bool check_extra = false;
1144  //check if sub-parts were counted
1145  subngrams.clear();
1146  skipgram.ngrams(subngrams,n-1); //this also works for and returns skipgrams, despite the name
// The inner iter2 below shadows the gap-mask iterator of the outer loop;
// the break statements in this loop only leave the inner loop.
1147  for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) { //only two patterns
1148  const PatternPointer subpattern = *iter2;
1149  if (!subpattern.isgap(0) && !subpattern.isgap(subpattern.n() - 1)) {
1150  //this subpattern is a valid
1151  //skipgram or ngram (no beginning or ending
1152  //gaps) that should occur
1153  if (DEBUG) {
1154  std::cerr << "Subpattern: " << std::endl;
1155  subpattern.out();
1156  }
1157  if (!this->has(subpattern)) {
1158  if (DEBUG) std::cerr << " discarded" << std::endl;
1159  skipgram_valid = false;
1160  break;
1161  }
1162  } else {
1163  //this check isn't enough, subpattern
1164  //starts or ends with gap
1165  //do additional checks
1166  check_extra = true;
1167  break;
1168  }
1169  }
1170  if (!skipgram_valid) continue;
1171 
1172 
1173  if (check_extra) {
1174  if (exhaustive) { //the following is by definition the case in non-exhaustive mode, so we need only do it in exhaustive mode:
1175 
1176  //test whether parts occur in model, otherwise skip
1177  //can't occur either and we can discard it
1178  std::vector<PatternPointer> parts;
1179  skipgram.parts(parts);
1180  for (std::vector<PatternPointer>::iterator iter3 = parts.begin(); iter3 != parts.end(); iter3++) {
1181  const PatternPointer part = *iter3;
1182  if (!this->has(part)) {
1183  skipgram_valid = false;
1184  break;
1185  }
1186  }
1187  if (!skipgram_valid) continue;
1188  }
1189 
1190  //check whether the gaps with single token context (X * Y) occur in model,
1191  //otherwise skipgram can't occur
1192  const std::vector<std::pair<int,int>> gapconfiguration = mask2vector(skipgram.mask, n);
1193  for (std::vector<std::pair<int,int>>::const_iterator iter3 = gapconfiguration.begin(); iter3 != gapconfiguration.end(); iter3++) {
1194  if (!((iter3->first - 1 == 0) && (iter3->first + iter3->second + 1 == n))) { //entire skipgram is already X * Y format
1195  const PatternPointer subskipgram = PatternPointer(skipgram, iter3->first - 1, iter3->second + 2);
1196  if (DEBUG) {
1197  std::cerr << "Subskipgram: " << std::endl;
1198  subskipgram.out();
1199  }
1200  if (!this->has(subskipgram)) {
1201  if (DEBUG) std::cerr << " discarded" << std::endl;
1202  skipgram_valid = false;
1203  break;
1204  }
1205  }
1206  }
1207  }
1208  }
1209 
1210 
1211  if (skipgram_valid) {
1212  if (DEBUG) std::cerr << " counted!" << std::endl;
1213  if (targetcontainer == NULL) {
1214  //put in model
1215  if (!has(skipgram)) foundskipgrams++;
1216  if (singleref != NULL) {
1217  add(skipgram, *singleref ); //counts the actual skipgram, will add it to the model
1218  } else if (multiplerefs != NULL) {
1219  for (IndexedData::const_iterator refiter = multiplerefs->begin(); refiter != multiplerefs->end(); refiter++) {
1220  const IndexReference ref = *refiter;
1221  add(skipgram, ref ); //counts the actual skipgram, will add it to the model
1222  }
1223  } else {
1224  std::cerr << "ERROR: computeskipgrams() called with no singleref and no multiplerefs" << std::endl;
1225  throw InternalError();
1226  }
1227  } else {
1228  //put in target container, may contain duplicates
1229  foundskipgrams++;
1230  targetcontainer->push_back(skipgram);
1231  }
1232 
1233  }
1234 
1235  } catch (InternalError &e) {
1236  std::cerr << "IGNORING ERROR and continuing with next skipgram" << std::endl;
1237  }
1238  }
1239  return foundskipgrams;
1240  }
1241 
1245  virtual int computeskipgrams(const PatternPointer & pattern, PatternModelOptions & options , const IndexReference * singleref= NULL, const IndexedData * multiplerefs = NULL, PatternModelInterface * constrainbymodel = NULL, const bool exhaustive = false) { //backward compatibility
1246  if (options.MINTOKENS_SKIPGRAMS < options.MINTOKENS) options.MINTOKENS_SKIPGRAMS = options.MINTOKENS;
1247  return computeskipgrams(pattern, options.MINTOKENS_SKIPGRAMS, singleref, multiplerefs, constrainbymodel, NULL, exhaustive, options.MAXSKIPS,options.DEBUG);
1248  }
1249 
1254  virtual std::vector<PatternPointer> findskipgrams(const PatternPointer & pattern, unsigned int occurrencethreshold = 1, int maxskips = 3) {
1255  //given the pattern, find all skipgrams in it that occur in the model
1256 
1257  std::vector<PatternPointer> skipgrams;
1258  this->computeskipgrams(pattern, occurrencethreshold, NULL, NULL, this->getinterface(), &skipgrams, false, maxskips);
1259  return skipgrams;
1260  }
1261 
1262 
1266  virtual void trainskipgrams(const PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL) {
1267  std::cerr << "Can not compute skipgrams on unindexed model (except exhaustively during train() )" << std::endl;
1268  throw InternalError();
1269  }
1270 
1271  //creates a new test model using the current model as training
1272  // i.e. only fragments existing in the training model are counted
1273  // remaining fragments are 'uncovered'
1274  void test(MapType & target, std::istream * in);
1275 
// Serialise the model to a binary output stream.
// Written in order: a single zero byte, the model type (getmodeltype()),
// the model version (getmodelversion()), [corpus size and raw corpus bytes],
// totaltokens as uint64_t, the number of word types (types()) as uint64_t,
// and finally the pattern store itself through MapType::write().
//
// NOTE(review): listing line 1286 is missing from this copy of the source.
// The closing brace on line 1289 has no visible opener, so the corpus
// size/data writes are presumably guarded by a condition on the missing
// line - restore it from the upstream file before relying on this block.
1279  void write(std::ostream * out) {
1280  const char null = 0;
1281  out->write( (char*) &null, sizeof(char));
1282  unsigned char t = this->getmodeltype();
1283  out->write( (char*) &t, sizeof(char));
1284  unsigned char v = this->getmodelversion();
1285  out->write( (char*) &v, sizeof(char));
1287  out->write( (char*) &this->corpussize, sizeof(unsigned int));
1288  out->write((char*) this->corpusstart, sizeof(unsigned char) * this->corpussize);
1289  }
1290  out->write( (char*) &totaltokens, sizeof(uint64_t));
1291  const uint64_t tp = this->types(); //use this instead of totaltypes, as it may need to be computed on-the-fly still
1292  out->write( (char*) &tp, sizeof(uint64_t));
1293  MapType::write(out); //write PatternStore
1294  }
1295 
1299  void write(const std::string filename) {
1300  std::ofstream * out = new std::ofstream(filename.c_str());
1301  this->write(out);
1302  out->close();
1303  delete out;
1304  }
1305 
1306  typedef typename MapType::iterator iterator;
1307  typedef typename MapType::const_iterator const_iterator;
1308 
1312  virtual int maxlength() const { return this->maxn; };
1316  virtual int minlength() const { return this->minn; };
1321  virtual unsigned int occurrencecount(const Pattern & pattern) {
1322  ValueType * data = this->getdata(pattern);
1323  if (data != NULL) {
1324  return this->valuehandler.count(*data);
1325  } else {
1326  return 0;
1327  }
1328  }
1329 
1330  virtual unsigned int occurrencecount(const PatternPointer & pattern) {
1331  ValueType * data = this->getdata(pattern);
1332  if (data != NULL) {
1333  return this->valuehandler.count(*data);
1334  } else {
1335  return 0;
1336  }
1337  }
1338 
1343  virtual ValueType * getdata(const Pattern & pattern, bool makeifnew=false) {
1344  typename MapType::iterator iter = this->find(pattern);
1345  if (iter != this->end()) {
1346  return &(iter->second);
1347  } else if (makeifnew) {
1348  return &((*this)[pattern]);
1349  } else {
1350  return NULL;
1351  }
1352  }
1353 
1354  virtual ValueType * getdata(const PatternPointer & pattern, bool makeifnew=false) {
1355  typename MapType::iterator iter = this->find(pattern);
1356  if (iter != this->end()) {
1357  return &(iter->second);
1358  } else if (makeifnew) {
1359  return &((*this)[pattern]);
1360  } else {
1361  return NULL;
1362  }
1363  }
1364 
1368  virtual unsigned int types() {
1369  if ((totaltypes == 0) && (!this->data.empty())) totaltypes = this->totalwordtypesingroup(0, 0);
1370  return totaltypes;
1371  }
1372 
1376  virtual unsigned int tokens() const { return totaltokens; }
1377 
1378  unsigned char type() const { return model_type; }
1379  unsigned char version() const { return model_version; }
1380 
1381  void output(std::ostream *);
1382 
1383 
1389  unsigned int coveragecount(const Pattern & key) {
1390  return this->occurrencecount(key) * key.size();
1391  }
1397  double coverage(const Pattern & key) {
1398  return this->coveragecount(key) / (double) this->tokens();
1399  }
1400 
1408  std::vector<PatternPointer> getreverseindex(const IndexReference ref, int occurrencecount = 0, int category = 0, unsigned int size = 0) {
1409  //Auxiliary function
1410  std::vector<PatternPointer> result;
1411  if (!this->reverseindex) return result;
1412  const unsigned int sl = this->reverseindex->sentencelength(ref.sentence);
1413  //std::cerr << "DEBUG: getreverseindex sentencelength(" << ref.sentence << ")=" << sl << std::endl;
1414  const unsigned int minn = this->minlength();
1415  const unsigned int maxn = this->maxlength();
1416  for (unsigned int n = minn; ref.token + n <= sl && n <= maxn; n++) {
1417  if ((size == 0) || (n == size)) {
1418  try {
1419  //std::cerr << "DEBUG: getreverseindex getpattern " << ref.tostring() << " + " << n << std::endl;
1420  const PatternPointer ngram = this->reverseindex->getpattern(ref,n);
1421  /*std::cerr << "n: " << ngram.n() << std::endl;
1422  std::cerr << "bytesize: " << ngram.bytesize() << std::endl;;
1423  std::cerr << "hash: " << ngram.hash() << std::endl;*/
1424  if ( (((occurrencecount == 0) && this->has(ngram)) || (this->occurrencecount(ngram) >= (unsigned int) occurrencecount))
1425  && ((category == 0) || (ngram.category() >= category)) ) {
1426  result.push_back(ngram);
1427 
1428  if (((category == 0) || (category == SKIPGRAM)) && (this->hasskipgrams)) {
1429 
1430  //(we can't use gettemplates() because
1431  //gettemplates() depends on us, we have to
1432  //solve it low-level, punching holes:
1433 
1434  std::vector<PatternPointer> skipgrams = this->findskipgrams(ngram, occurrencecount);
1435  for (auto skipgram : skipgrams) {
1436  result.push_back(skipgram);
1437  }
1438 
1439  //TODO: flexgrams
1440 
1441  }
1442  }
1443  } catch (KeyError &e) {
1444  break;
1445  }
1446  }
1447  }
1448  return result;
1449  }
1450 
1451  /*std::vector<Pattern> getreverseindex_bysentence(int sentence) {
1452  //Auxiliary function
1453  std::vector<Pattern> result;
1454  for (int i = 0; i < this->reverseindex.sentencelength(sentence); i++) {
1455  const IndexReference ref = IndexReference(sentence, i);
1456  std::vector<Pattern> tmpresult = this->getreverseindex(ref);
1457  for (std::vector<Pattern>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1458  const Pattern pattern = *iter;
1459  result.push_back(pattern);
1460  }
1461  }
1462  return result;
1463  }*/
1464 
1471  std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_bysentence(int sentence) {
1472  //Auxiliary function
1473  std::vector<std::pair<IndexReference,PatternPointer>> result;
1474  for (int i = 0; i < this->reverseindex->sentencelength(sentence); i++) {
1475  const IndexReference ref = IndexReference(sentence, i);
1476  std::vector<PatternPointer> tmpresult = this->getreverseindex(ref);
1477  for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1478  const PatternPointer pattern = *iter;
1479  result.push_back(std::pair<IndexReference,PatternPointer>(ref,pattern));
1480  }
1481  }
1482  return result;
1483  }
1484 
1489  std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_right(const IndexReference ref) {
1490  //Auxiliary function
1491  std::vector<std::pair<IndexReference,PatternPointer>> result;
1492  for (int i = ref.token+1; i < this->reverseindex->sentencelength(ref.sentence); i++) {
1493  const IndexReference ref2 = IndexReference(ref.sentence, i);
1494  std::vector<PatternPointer> tmpresult = this->getreverseindex(ref);
1495  for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1496  const PatternPointer pattern = *iter;
1497  result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
1498  }
1499  }
1500  return result;
1501  }
1502 
1507  std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_left(const IndexReference ref) {
1508  //Auxiliary function
1509  std::vector<std::pair<IndexReference,PatternPointer>> result;
1510  for (int i = 0; i < ref.token; i++) {
1511  const IndexReference ref2 = IndexReference(ref.sentence, i);
1512  std::vector<PatternPointer> tmpresult = this->getreverseindex(ref);
1513  for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1514  const PatternPointer pattern = *iter;
1515  result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
1516  }
1517  }
1518  return result;
1519  }
1520 
1526  void computestats() {
1527  cache_categories.clear();
1528  cache_n.clear();
1529  cache_grouptotal.clear();
1530  cache_grouptotalpatterns.clear();
1531  cache_categories.insert(0);
1532  cache_n.insert(0);
1533  PatternModel::iterator iter = this->begin();
1534  while (iter != this->end()) {
1535  const PatternType pattern = iter->first;
1536  const int c = pattern.category();
1537  cache_categories.insert(c);
1538  const int n = pattern.n();
1539  cache_n.insert(n);
1540 
1541  //total of occurrences in a group, used for frequency computation
1542  if (c != FLEXGRAM){
1543  //no storage per N for dynamic skipgrams
1544  cache_grouptotal[c][n] += this->valuehandler.count(iter->second);
1545  cache_grouptotal[0][n] += this->valuehandler.count(iter->second);
1546  cache_grouptotalpatterns[c][n]++;
1547  cache_grouptotalpatterns[0][n]++;
1548  }
1549  cache_grouptotal[c][0] += this->valuehandler.count(iter->second);
1550  cache_grouptotal[0][0] += this->valuehandler.count(iter->second);
1551 
1552  //total of distinct patterns in a group
1553  cache_grouptotalpatterns[c][0]++;
1554  cache_grouptotalpatterns[0][0]++;
1555  iter++;
1556  }
1557  }
1558 
1559  virtual void resetstats() {
1560  cache_grouptotalwordtypes.clear();
1561  cache_grouptotaltokens.clear();
1562  }
1563 
1569  virtual void computecoveragestats(int category = 0, int n = 0) {
1570  if ((cache_grouptotal.empty()) && (!this->data.empty())) this->computestats();
1571  //bool hasunigrams = false;
1572 
1573  //opting for memory over speed (more iterations, less memory)
1574  // Indexed model overloads this for better cache_grouptotaltokens computation!
1575  for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
1576  if ((category == 0) || (*iterc == category)) {
1577  for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
1578  if (((n == 0) || (*itern == n)) && (cache_grouptotalwordtypes[*iterc][*itern] == 0) ) {
1579  std::unordered_set<Pattern> types;
1580  PatternModel::iterator iter = this->begin();
1581  while (iter != this->end()) {
1582  const PatternType pattern = iter->first;
1583  const int pn = (int) pattern.n();
1584  if ( (pn == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {
1585  types.insert(pattern);
1586  } else {
1587  if (((*itern == 0) || (pn == *itern)) && ((*iterc == 0) || (pattern.category() == *iterc))) {
1588  std::vector<PatternType> unigrams;
1589  pattern.ngrams(unigrams, 1);
1590  for (typename std::vector<PatternType>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
1591  const PatternType p = *iter2;
1592  types.insert(p);
1593  }
1594  }
1595  }
1596  cache_grouptotaltokens[*iterc][*itern] += this->valuehandler.count(iter->second);
1597  iter++;
1598  }
1599  cache_grouptotalwordtypes[*iterc][*itern] += types.size();
1600  }
1601  }
1602  }
1603  }
1604 
1605  /*
1606  if (!hasunigrams) {
1607  for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
1608  int max = 0;
1609  for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
1610  if cache_grouptotalwordtypes[*iterc][*itern]
1611  }
1612  }
1613 
1614  }*/
1615  }
1616 
1617 
1623  unsigned int totaloccurrencesingroup(int category, int n) {
1624  //category and n can be set to 0 to loop over all
1625  if ((cache_grouptotal.empty()) && (!this->data.empty())) this->computestats();
1626  return cache_grouptotal[category][n];
1627  }
1628 
1634  unsigned int totalpatternsingroup(int category, int n) {
1635  //category and n can be set to 0 to loop over all
1636  if ((cache_grouptotalpatterns.empty()) && (!this->data.empty())) this->computestats();
1637  return cache_grouptotalpatterns[category][n];
1638  }
1639 
1645  unsigned int totalwordtypesingroup(int category, int n) {
1646  //total covered word/unigram types
1647  //category and n can be set to 0 to loop over all
1648  if ((cache_grouptotalwordtypes.empty()) && (!this->data.empty())) this->computecoveragestats(category,n);
1649  return cache_grouptotalwordtypes[category][n];
1650  }
1656  unsigned int totaltokensingroup(int category, int n) {
1657  //total COVERED tokens
1658  //category and n can be set to 0 to loop over all
1659  if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) this->computecoveragestats(category,n);
1660  return cache_grouptotaltokens[category][n];
1661  }
1662 
1666  double frequency(const Pattern & pattern) {
1667  //frequency within the same n and category class
1668  return this->occurrencecount(pattern) / (double) totaloccurrencesingroup(pattern.category(),pattern.n());
1669  }
1670 
1671 
1672 
1680  virtual void add(const PatternPointer & patternpointer, const IndexReference & ref) {
1681  const Pattern pattern = Pattern(patternpointer);
1682  /*if ((pattern.isskipgram()) || (pattern.isflexgram())) { //TODO: remove
1683  std::cerr << "Adding skipgram!" << std::endl;
1684  std::cerr << "pp.mask=" << patternpointer.mask << std::endl;
1685  std::cerr << "pp.b=" << patternpointer.bytesize() << std::endl;
1686  std::cerr << "p.b=" << pattern.bytesize() << std::endl;
1687  patternpointer.out();
1688  std::cerr << std::endl;
1689  pattern.out();
1690  throw InternalError();
1691  }*/
1692  ValueType * data = getdata(pattern, true);
1693  this->add(pattern, data, ref );
1694  }
1695 
1704  virtual void add(const Pattern & pattern, ValueType * value, const IndexReference & ref) {
1705  if (value == NULL) {
1706  std::cerr << "Add() value is NULL!" << std::endl;
1707  throw InternalError();
1708  }
1709  this->valuehandler.add(value, ref);
1710  }
1711  virtual void add(const PatternPointer & pattern, ValueType * value, const IndexReference & ref) {
1712  if (value == NULL) {
1713  std::cerr << "Add() value is NULL!" << std::endl;
1714  throw InternalError();
1715  }
1716  this->valuehandler.add(value, ref);
1717  }
1718 
1719 
1728  unsigned int prune(int threshold,int _n=0) {
1729  //prune all patterns under the specified threshold (set -1 for
1730  //all) and of the specified length (set _n==0 for all)
1731  unsigned int pruned = 0;
1732  PatternModel::iterator iter = this->begin();
1733  while (iter != this->end()) {
1734  const PatternType pattern = iter->first;
1735  if (( (_n == 0) || (pattern.n() == (unsigned int) _n) )&& ((threshold == -1) || (occurrencecount(pattern) < (unsigned int) threshold))) {
1736  /*std::cerr << "preprune:" << this->size() << std::endl;
1737  std::cerr << "DEBUG: pruning " << (int) pattern.category() << ",n=" << pattern.n() << ",skipcount=" << pattern.skipcount() << ",hash=" << pattern.hash() << std::endl;
1738  std::cerr << occurrencecount(pattern) << std::endl;*/
1739  iter = this->erase(iter);
1740  //std::cerr << "postprune:" << this->size() << std::endl;
1741  pruned++;
1742  } else {
1743  iter++;
1744  }
1745  };
1746 
1747  return pruned;
1748  }
1749 
1758  virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n = 0) {
1759  //NOTE: minskiptypes is completely ignored! that only works for indexed models
1760  unsigned int pruned = 0;
1761  if (minskiptypes <=1) return pruned; //nothing to do
1762 
1763  typename PatternModel<ValueType,BaseValueHandler<ValueType>,MapType>::iterator iter = this->begin();
1764  while(iter != this->end()) {
1765  const PatternType pattern = iter->first;
1766  if (( (_n == 0) || ((int) pattern.n() == _n) ) && (pattern.category() == SKIPGRAM)) {
1767  if (this->occurrencecount(pattern) < threshold) {
1768  iter = this->erase(iter);
1769  pruned++;
1770  continue;
1771  }
1772  }
1773  iter++;
1774  }
1775  return pruned;
1776  }
1777 
1784  unsigned int prunenotinset(const std::unordered_set<Pattern> & s, int _n) {
1785  unsigned int pruned = 0;
1786  if (s.empty()) {
1787  return pruned;
1788  }
1789  PatternModel::iterator iter = this->begin();
1790  while (iter != this->end()) {
1791  const PatternType pattern = iter->first;
1792  if ( (_n == 0) || (pattern.n() == (unsigned int) _n) ) {
1793  if (s.find(pattern) == s.end()) {
1794  //not found in set
1795  iter = this->erase(iter);
1796  pruned++;
1797  continue;
1798  }
1799  }
1800  iter++;
1801  };
1802 
1803  return pruned;
1804  }
1805 
// Remove from this model every pattern that is not present in a second
// model (checked with secondmodel.has()); returns the number of patterns
// removed.
//
// NOTE(review): listing lines 1811 (the function signature, which declares
// `secondmodel`) and 1817 (the declaration of `iter`) are missing from this
// copy of the source; restore them from the upstream file before building.
1810  template<class ValueType2,class ValueHandler2,class MapType2>
1812  //is not used by default when working with constraint models
1813  //anymore, is directly processing during loading instead
1814  //
1815  //this is still useful if you have two models in memory though
1816  unsigned int pruned = 0;
1818  while(iter != this->end()) {
1819  const PatternType pattern = iter->first;
1820  if (!secondmodel.has(pattern)) {
1821  iter = this->erase(iter);
1822  pruned++;
1823  continue;
1824  }
1825  iter++;
1826  }
1827  return pruned;
1828  }
1829 
1834  std::vector<std::pair<Pattern, int> > getpatterns(const Pattern & pattern) {
1835  //get all patterns in pattern
1836  std::vector<std::pair<Pattern, int> > v;
1837  std::vector<std::pair<Pattern, int> > ngrams;
1838  pattern.subngrams(ngrams, minlength(), maxlength());
1839  for (std::vector<std::pair<Pattern, int> >::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
1840  const Pattern p = iter->first;
1841  if (this->has(p)) v.push_back(*iter);
1842 
1843  //TODO: match with skipgrams
1844  }
1845  return v;
1846  }
1847 
1854  virtual void print(std::ostream * out, ClassDecoder & decoder) {
1855  bool haveoutput = false;
1856  for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
1857  if (!haveoutput) {
1858  *out << "PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY" << std::endl;
1859  haveoutput = true;
1860  }
1861  const PatternType pattern = iter->first;
1862  this->print(out, decoder, pattern, true);
1863  }
1864  if (haveoutput) {
1865  std::cerr << std::endl << "Legend:" << std::endl;
1866  std::cerr << " - PATTERN : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;
1867  std::cerr << " - COUNT : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
1868  std::cerr << " - TOKENS : The maximum number of tokens in the corpus that this pattern covers. *THIS IS JUST A MAXIMUM PROJECTION* rather than an exact number because your model is not indexed" << std::endl;
1869  std::cerr << " - COVERAGE : The maximum number of tokens covered, as a fraction of the total in the corpus (projection)" << std::endl;
1870  std::cerr << " - CATEGORY : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
1871  std::cerr << " - SIZE : The size of the pattern (in tokens)" << std::endl;
1872  std::cerr << " - FREQUENCY : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
1873  std::cerr << " - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
1874  }
1875  }
1876 
1883  virtual void printreverseindex(std::ostream * out, ClassDecoder & decoder) {
1884  if (!this->reverseindex) return;
1885  for (IndexedCorpus::iterator iter = reverseindex->begin(); iter != reverseindex->end(); iter++) {
1886  const IndexReference ref = iter->first;
1887  std::vector<PatternPointer> rindex = this->getreverseindex(ref);
1888  *out << ref.tostring();
1889  for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
1890  const Pattern p = *iter2;
1891  *out << "\t" << p.tostring(decoder);
1892  }
1893  *out << "\n";
1894  }
1895  *out << std::endl;
1896  }
1897 
1898 
1902  void printmodel(std::ostream * out, ClassDecoder & decoder) { //an alias because cython can't deal with a method named print
1903  this->print(out, decoder);
1904  }
1905 
1911  virtual void print(std::ostream* out, ClassDecoder &decoder, const PatternType & pattern, bool endline = true) {
1912  const std::string pattern_s = pattern.tostring(decoder);
1913  const unsigned int count = this->occurrencecount(pattern);
1914  const unsigned int covcount = this->coveragecount(pattern);
1915  const double coverage = covcount / (double) this->tokens();
1916  const double freq = this->frequency(pattern);
1917  const int cat = pattern.category();
1918  std::string cat_s;
1919  if (cat == 1) {
1920  cat_s = "ngram";
1921  } else if (cat == 2) {
1922  cat_s = "skipgram";
1923  } else if (cat == 3) {
1924  cat_s = "flexgram";
1925  }
1926  *out << pattern_s << "\t" << count << "\t" << "\t" << covcount << "\t" << coverage << "\t" << cat_s << "\t" << pattern.size() << "\t" << freq;
1927  if (endline) *out << std::endl;
1928  //*out << pattern.hash() << "\t" << (size_t) pattern.data << std::endl;
1929  }
1930 
1931 
1935  void printpattern(std::ostream* out, ClassDecoder &decoder, const Pattern & pattern, bool endline = true) { //another alias for cython who can't deal with methods named print
1936  return this->print(out,decoder,pattern,endline);
1937  }
1938 
1939 
1948  void histogram(std::map<unsigned int,unsigned int> & hist, unsigned int threshold = 0, unsigned int cap = 0, int category = 0, unsigned int size = 0) {
1949  for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
1950  const PatternType pattern = iter->first;
1951  if (((category != 0) && (pattern.category() != category)) || ((size != 0) && (size != pattern.size()))) continue;
1952  unsigned int c = this->occurrencecount(pattern);
1953  if (c >= threshold) hist[c]++;
1954  }
1955  if (cap > 0) {
1956  unsigned int sum = 0;
1957  std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
1958  while ((sum < cap) && (iter != hist.rend())) {
1959  iter++;
1960  sum += iter->second;
1961  }
1962  //delete everything else
1963  hist.erase(iter.base(), hist.end());
1964  }
1965  }
1966 
1967  unsigned int topthreshold(int amount, int category=0, int size=0) {
1968  //compute occurrence threshold that holds the top $amount occurrences
1969  std::map<unsigned int,unsigned int> hist;
1970  histogram(hist, 0, amount, category, size);
1971  std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
1972  if (iter != hist.rend()) {
1973  return iter->first;
1974  } else {
1975  return 0;
1976  }
1977  }
1978 
1979 
1988  void histogram(std::ostream * OUT, unsigned int threshold = 0, unsigned int cap = 0 , int category = 0, unsigned int size = 0) {
1989  std::map<unsigned int,unsigned int> hist;
1990  histogram(hist,threshold,cap,category,size);
1991  *OUT << "HISTOGRAM" << std::endl;
1992  *OUT << "------------------------------" << std::endl;
1993  *OUT << "OCCURRENCES\tPATTERNS" << std::endl;
1994  for (std::map<unsigned int,unsigned int>::iterator iter = hist.begin(); iter != hist.end(); iter++) {
1995  *OUT << iter->first << "\t" << iter->second << std::endl;
1996  }
1997  }
1998 
2002  void info(std::ostream * OUT) {
2003  if (this->getmodeltype() == INDEXEDPATTERNMODEL) {
2004  *OUT << "Type: indexed" << std::endl;
2005  } else if (this->getmodeltype() == UNINDEXEDPATTERNMODEL) {
2006  *OUT << "Type: unindexed" << std::endl;
2007  } else {
2008  //should never happen
2009  *OUT << "Type: unknown" << std::endl;
2010  }
2011  *OUT << "Total tokens: " << this->totaltokens << std::endl;
2012  *OUT << "Total word types: " << this->totaltypes << std::endl;
2013  *OUT << "Types patterns loaded: " << this->size() << std::endl;
2014  *OUT << "Min n: " << this->minn << std::endl;
2015  *OUT << "Max n: " << this->maxn << std::endl;
2016  if (this->reverseindex) {
2017  *OUT << "Reverse index: yes" << std::endl;
2018  *OUT << "References in reverse index: " << this->reverseindex->size() << std::endl;
2019  } else {
2020  *OUT << "Reverse index: no" << std::endl;
2021  }
2022  *OUT << "Size of Pattern: " << sizeof(Pattern) << " byte" << std::endl;
2023  *OUT << "Size of ValueType: " << sizeof(ValueType) << " byte" << std::endl;
2024  unsigned int totalkeybs = 0;
2025  unsigned int totalvaluebs = 0;
2026  for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
2027  const PatternType pattern = iter->first;
2028  totalkeybs += sizeof(PatternType) + pattern.bytesize();
2029  totalvaluebs += sizeof(ValueType);
2030  }
2031  *OUT << "Total key bytesize (patterns): " << totalkeybs << " bytes (" << (totalkeybs/1024/1024) << " MB)" << std::endl;
2032  *OUT << "Total value bytesize (counts/index): " << totalvaluebs << " bytes (" << (totalvaluebs/1024/1024) << " MB)" << std::endl;
2033  *OUT << "Mean key bytesize: " << (totalkeybs / (float) this->size()) << std::endl;
2034  *OUT << "Mean value bytesize: " << (totalvaluebs / (float) this->size()) << std::endl;
2035 
2036  unsigned int ri_totalkeybs = 0;
2037  unsigned int ri_totalvaluebs = 0;
2038  if (this->reverseindex) {
2039  for (IndexedCorpus::iterator iter = this->reverseindex->begin(); iter != this->reverseindex->end(); iter++) {
2040  ri_totalkeybs += sizeof(iter->first.sentence) + sizeof(iter->first.token);
2041  ri_totalvaluebs += sizeof(IndexPattern); // sizeof(Pattern) + iter->pattern().bytesize();
2042  }
2043  *OUT << "Total key bytesize in reverse index (references): " << ri_totalkeybs << " bytes (" << (ri_totalkeybs/1024/1024) << " MB)" << std::endl;
2044  *OUT << "Total value bytesize in reverse index (patterns): " << ri_totalvaluebs << " bytes (" << (ri_totalvaluebs/1024/1024) << " MB)" << std::endl;
2045  }
2046 
2047 
2048  const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
2049  *OUT << "Total bytesize (without overhead): " << t << " bytes (" << (t/1024/1024) << " MB)" << std::endl;
2050  }
2051 
2056  void report(std::ostream * OUT) {
2057  if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) {
2058  std::cerr << "Computing statistics..." << std::endl;
2059  this->computecoveragestats();
2060  }
2061  *OUT << std::setiosflags(std::ios::fixed) << std::setprecision(4) << std::endl;
2062  *OUT << "REPORT" << std::endl;
2063  if (this->getmodeltype() == UNINDEXEDPATTERNMODEL) {
2064  *OUT << " Warning: Model is unindexed, token coverage counts are mere maximal projections" << std::endl;
2065  *OUT << " assuming no overlap at all!!! Use an indexed model for accurate coverage counts" << std::endl;
2066  }
2067  *OUT << "----------------------------------" << std::endl;
2068  *OUT << " " << std::setw(15) << "PATTERNS" << std::setw(15) << "TOKENS" << std::setw(15) << "COVERAGE" << std::setw(15) << "TYPES" << std::setw(15) << std::endl;
2069  *OUT << "Total: " << std::setw(15) << "-" << std::setw(15) << this->tokens() << std::setw(15) << "-" << std::setw(15) << this->types() << std::endl;
2070 
2071  unsigned int coveredtypes = totalwordtypesingroup(0,0); //will also work when no unigrams in model!
2072  unsigned int coveredtokens = totaltokensingroup(0,0);
2073 
2074  if (coveredtokens > this->tokens()) coveredtokens = this->tokens();
2075  unsigned int uncoveredtokens = this->tokens() - coveredtokens;
2076  if (uncoveredtokens < 0) uncoveredtokens = 0;
2077  *OUT << "Uncovered: " << std::setw(15) << "-" << std::setw(15) << uncoveredtokens << std::setw(15) << uncoveredtokens / (double) this->tokens() << std::setw(15) << this->types() - coveredtypes << std::endl;
2078  *OUT << "Covered: " << std::setw(15) << this->size() << std::setw(15) << coveredtokens << std::setw(15) << coveredtokens / (double) this->tokens() << std::setw(15) << coveredtypes << std::endl << std::endl;
2079 
2080 
2081 
2082  bool haveoutput = false;
2083  for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
2084  const int c = *iterc;
2085  if (cache_grouptotalpatterns.count(c))
2086  for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
2087  const int n = *itern;
2088  if (cache_grouptotalpatterns[c].count(n)) {
2089  if (!haveoutput) {
2090  //output headers
2091  *OUT << std::setw(15) << "CATEGORY" << std::setw(15) << "N (SIZE) "<< std::setw(15) << "PATTERNS";
2092  if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) *OUT << std::setw(15) << "TOKENS" << std::setw(15) << "COVERAGE";
2093  *OUT << std::setw(15) << "TYPES" << std::setw(15) << "OCCURRENCES" << std::endl;
2094  haveoutput = true;
2095  }
2096  //category
2097  if (c == 0) {
2098  *OUT << std::setw(15) << "all";
2099  } else if (c == NGRAM) {
2100  *OUT << std::setw(15) << "n-gram";
2101  } else if (c == SKIPGRAM) {
2102  *OUT << std::setw(15) << "skipgram";
2103  } else if (c == FLEXGRAM) {
2104  *OUT << std::setw(15) << "flexgram";
2105  }
2106  //size
2107  if (n == 0) {
2108  *OUT << std::setw(15) << "all";
2109  } else {
2110  *OUT << std::setw(15) << n;
2111  }
2112  //patterns
2113  *OUT << std::setw(15) << cache_grouptotalpatterns[c][n];
2114  if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) {
2115  //tokens
2116  *OUT << std::setw(15) << cache_grouptotaltokens[c][n];
2117  //coverage
2118  *OUT << std::setw(15) << cache_grouptotaltokens[c][n] / (double) this->tokens();
2119  }
2120  //types
2121  *OUT << std::setw(15) << cache_grouptotalwordtypes[c][n];
2122  //occurrences
2123  *OUT << std::setw(15) << cache_grouptotal[c][n] << std::endl;;
2124  }
2125  }
2126  }
2127 
2128  if (haveoutput) {
2129  std::cerr << std::endl << "Legend:" << std::endl;
2130  std::cerr << " - PATTERNS : The number of distinct patterns within the group" << std::endl;
2131  if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) {
2132  std::cerr << " - TOKENS : The number of tokens that is covered by the patterns in the group." << std::endl;
2133  std::cerr << " - COVERAGE : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
2134  }
2135  std::cerr << " - TYPES : The number of unique *word/unigram* types in this group" << std::endl;
2136  std::cerr << " - OCCURRENCES : The total number of occurrences of the patterns in this group" << std::endl;
2137  }
2138  }
2139 
2140 
2148 
2149  PatternSet<uint64_t> result;
2150  for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
2151  const PatternType pattern = iter->first;
2152  const int patternlength = pattern.n();
2153  if ((patternlength >= minlength) && (patternlength <= maxlength)) {
2154  result.insert(pattern);
2155  } else if (patternlength > maxlength) {
2156  std::vector<Pattern> subngrams;
2157  pattern.subngrams(subngrams,minlength, maxlength);
2158  for (std::vector<Pattern>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
2159  const Pattern pattern2 = *iter2;
2160  result.insert(pattern2);
2161  }
2162  }
2163  }
2164  return result;
2165  }
2166 
2167 
2168  virtual void outputrelations(const Pattern & pattern, ClassDecoder & classdecoder, std::ostream * OUT) {} //does nothing for unindexed models
2169  virtual t_relationmap getsubchildren(const Pattern & pattern,int = 0, int = 0, int = 0) { return t_relationmap(); } //does nothing for unindexed models
2170  virtual t_relationmap getsubparents(const Pattern & pattern,int = 0, int = 0, int = 0) { return t_relationmap(); } //does nothing for unindexed models
2171  virtual t_relationmap gettemplates(const Pattern & pattern,int = 0) { return t_relationmap(); } //does nothing for unindexed models
2172  virtual t_relationmap getinstances(const Pattern & pattern,int = 0) { return t_relationmap(); } //does nothing for unindexed models
2173  virtual t_relationmap getskipcontent(const PatternPointer & pattern) { return t_relationmap(); } //does nothing for unindexed models
2174  virtual t_relationmap getleftneighbours(const Pattern & pattern,int = 0, int = 0,int = 0,int =0) { return t_relationmap(); } //does nothing for unindexed models
2175  virtual t_relationmap getrightneighbours(const Pattern & pattern,int = 0, int = 0,int = 0,int =0) { return t_relationmap(); } //does nothing for unindexed models
2176  virtual t_relationmap_double getnpmi(const Pattern & pattern, double threshold) { return t_relationmap_double(); } //does nothing for unindexed models
2177  virtual int computeflexgrams_fromskipgrams() { return 0; }//does nothing for unindexed models
2178  virtual int computeflexgrams_fromcooc() {return 0; }//does nothing for unindexed models
2179  virtual void outputcooc_npmi(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {}
2180  virtual void outputcooc(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {}
2181 };
2182 
2183 
2184 
2185 
2191 template<class MapType = PatternMap<IndexedData,IndexedDataHandler>,class PatternType = Pattern>
2192 class IndexedPatternModel: public PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType> {
2193  protected:
2194  virtual void postread(const PatternModelOptions options) {
2195  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin(); iter != this->end(); iter++) {
2196  const Pattern p = iter->first;
2197  const int n = p.n();
2198  if (n > this->maxn) this->maxn = n;
2199  if (n < this->minn) this->minn = n;
2200  if ((!this->hasskipgrams) && (p.isskipgram())) this->hasskipgrams = true;
2201  }
2202  }
2203  virtual void posttrain(const PatternModelOptions options) {
2204  if (!options.QUIET) std::cerr << "Sorting all indices..." << std::endl;
2205  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin(); iter != this->end(); iter++) {
2206  iter->second.sort();
2207  }
2208 
2209  }
2210  public:
2211 
2212 
2217  this->model_type = this->getmodeltype();
2218  this->model_version = this->getmodelversion();
2219  if (corpus) {
2220  this->reverseindex = corpus;
2221  this->attachcorpus(*corpus);
2222  } else {
2223  this->reverseindex = NULL;
2224  }
2225  }
2226 
2235  this->model_type = this->getmodeltype();
2236  this->model_version = this->getmodelversion();
2237  if (corpus) {
2238  this->reverseindex = corpus;
2239  this->attachcorpus(*corpus);
2240  } else {
2241  this->reverseindex = NULL;
2242  }
2243  this->load(f,options, constrainmodel);
2244  }
2245 
2253  IndexedPatternModel<MapType,PatternType>(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>() { //load from file
2254  this->model_type = this->getmodeltype();
2255  this->model_version = this->getmodelversion();
2256  if (corpus) {
2257  this->reverseindex = corpus;
2258  this->attachcorpus(*corpus);
2259  } else {
2260  this->reverseindex = NULL;
2261  }
2262  std::ifstream * in = new std::ifstream(filename.c_str());
2263  this->load( (std::istream *) in, options, constrainmodel);
2264  in->close();
2265  delete in;
2266  }
2267 
2269 
2270 
2271  int getmodeltype() const { return INDEXEDPATTERNMODEL; }
2272  int getmodelversion() const { return 2;}
2273 
2274 
2282  virtual void add(const Pattern & pattern, IndexedData * value, const IndexReference & ref) {
2283  if (value == NULL) {
2284  value = getdata(pattern,true);
2285  }
2286  this->valuehandler.add(value, ref);
2287  }
2288  virtual void add(const PatternPointer & patternpointer, IndexedData * value, const IndexReference & ref) {
2289  if (value == NULL) {
2290  value = getdata(patternpointer,true);
2291  }
2292  this->valuehandler.add(value, ref);
2293  }
2294 
2299  IndexedData * getdata(const Pattern & pattern, bool makeifnew=false) {
2300  typename MapType::iterator iter = this->find(pattern);
2301  if (iter != this->end()) {
2302  return &(iter->second);
2303  } else if (makeifnew) {
2304  return &((*this)[pattern]);
2305  } else {
2306  return NULL;
2307  }
2308  }
2309 
2310  IndexedData * getdata(const PatternPointer & pattern, bool makeifnew=false) {
2311  typename MapType::iterator iter = this->find(pattern);
2312  if (iter != this->end()) {
2313  return &(iter->second);
2314  } else if (makeifnew) {
2315  return &((*this)[pattern]);
2316  } else {
2317  return NULL;
2318  }
2319  }
2320 
2321  virtual void train(std::istream * in , PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
2322  if ((options.DOSKIPGRAMS) && (this->reverseindex == NULL)) {
2323  std::cerr << "ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
2324  throw InternalError();
2325  }
2326  PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::train(in,options,constrainbymodel,continued,firstsentence,ignoreerrors);
2327  }
2328 
2329  virtual void train(const std::string & filename, PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
2330  if ((options.DOSKIPGRAMS) && (this->reverseindex == NULL)) {
2331  std::cerr << "ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
2332  throw InternalError();
2333  }
2334  PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::train(filename,options,constrainbymodel,continued,firstsentence,ignoreerrors);
2335  }
2336 
2340  void info(std::ostream * OUT) {
2341  if (this->getmodeltype() == INDEXEDPATTERNMODEL) {
2342  *OUT << "Type: indexed" << std::endl;
2343  } else if (this->getmodeltype() == UNINDEXEDPATTERNMODEL) {
2344  *OUT << "Type: unindexed" << std::endl;
2345  } else {
2346  //should never happen
2347  *OUT << "Type: unknown" << std::endl;
2348  }
2349  *OUT << "Total tokens: " << this->totaltokens << std::endl;
2350  *OUT << "Total word types: " << this->totaltypes << std::endl;
2351  *OUT << "Types patterns loaded: " << this->size() << std::endl;
2352  *OUT << "Min n: " << this->minn << std::endl;
2353  *OUT << "Max n: " << this->maxn << std::endl;
2354  if (this->reverseindex) {
2355  *OUT << "Reverse index: yes" << std::endl;
2356  *OUT << "References in reverse index: " << this->reverseindex->size() << std::endl;
2357  } else {
2358  *OUT << "Reverse index: no" << std::endl;
2359  }
2360  *OUT << "Size of Pattern: " << sizeof(Pattern) << " byte" << std::endl;
2361  unsigned int totalkeybs = 0;
2362  unsigned int totalvaluebs = 0;
2363  unsigned int indexlengthsum = 0;
2364  for (typename IndexedPatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
2365  const Pattern pattern = iter->first;
2366  totalkeybs += sizeof(Pattern) + pattern.bytesize();
2367  totalvaluebs += iter->second.size() * sizeof(IndexReference); //sentence + token;
2368  indexlengthsum += iter->second.size();
2369  }
2370  *OUT << "Total key bytesize (patterns): " << totalkeybs << " bytes (" << (totalkeybs/1024/1024) << " MB)" << std::endl;
2371  *OUT << "Total value bytesize (counts/index): " << totalvaluebs << " bytes (" << (totalvaluebs/1024/1024) << " MB)" << std::endl;
2372  *OUT << "Mean key bytesize: " << (totalkeybs / (float) this->size()) << std::endl;
2373  *OUT << "Mean value bytesize: " << (totalvaluebs / (float) this->size()) << std::endl;
2374  *OUT << "Mean index length (ttr): " << (indexlengthsum / (float) this->size()) << std::endl;
2375 
2376  unsigned int ri_totalkeybs = 0;
2377  unsigned int ri_totalvaluebs = 0;
2378  if (this->reverseindex) {
2379  for (IndexedCorpus::iterator iter = this->reverseindex->begin(); iter != this->reverseindex->end(); iter++) {
2380  ri_totalkeybs += sizeof(iter->first.sentence) + sizeof(iter->first.token);
2381  ri_totalvaluebs += sizeof(IndexPattern); // sizeof(Pattern) + iter->pattern().bytesize();
2382  }
2383  *OUT << "Total key bytesize in reverse index (references): " << ri_totalkeybs << " bytes (" << (ri_totalkeybs/1024/1024) << " MB)" << std::endl;
2384  *OUT << "Total value bytesize in reverse index (patterns): " << ri_totalvaluebs << " bytes (" << (ri_totalvaluebs/1024/1024) << " MB)" << std::endl;
2385  }
2386 
2387  const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
2388  *OUT << "Total bytesize (without overhead): " << t << " bytes (" << (t/1024/1024) << " MB)" << std::endl;
2389  }
2390 
2391 
2398  void print(std::ostream * out, ClassDecoder & decoder) {
2399  bool haveoutput = false;
2400  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {
2401  if (!haveoutput) {
2402  *out << "PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY\tREFERENCES" << std::endl;
2403  haveoutput = true;
2404  }
2405  const PatternPointer pattern = iter->first;
2406  this->print(out, decoder, pattern, true);
2407  }
2408  if (haveoutput) {
2409  std::cerr << std::endl << "Legend:" << std::endl;
2410  std::cerr << " - PATTERN : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;
2411  std::cerr << " - COUNT : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
2412  std::cerr << " - TOKENS : The number of tokens in the corpus that this pattern covers" << std::endl;
2413  std::cerr << " - COVERAGE : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
2414  std::cerr << " - CATEGORY : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
2415  std::cerr << " - SIZE : The size of the pattern (in tokens)" << std::endl;
2416  std::cerr << " - FREQUENCY : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
2417  std::cerr << " - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
2418  }
2419  }
2420 
2421  void print(std::ostream* out, ClassDecoder &decoder, const PatternPointer & pattern, bool endline = true) {
2422  const std::string pattern_s = pattern.tostring(decoder);
2423  const unsigned int count = this->occurrencecount(pattern);
2424  const unsigned int covcount = this->coveragecount(pattern);
2425  const double coverage = covcount / (double) this->tokens();
2426  const double freq = this->frequency(pattern);
2427  const int cat = pattern.category();
2428  std::string cat_s;
2429  if (cat == 1) {
2430  cat_s = "ngram";
2431  } else if (cat == 2) {
2432  cat_s = "skipgram";
2433  } else if (cat == 3) {
2434  cat_s = "flexgram";
2435  }
2436  *out << pattern_s << "\t" << count << "\t" << "\t" << covcount << "\t" << coverage << "\t" << cat_s << "\t" << pattern.size() << "\t" << freq << "\t";
2437  IndexedData * data = this->getdata(pattern);
2438  unsigned int i = 0;
2439  for (IndexedData::iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {
2440  i++;
2441  *out << iter2->tostring();
2442  if (i < count) *out << " ";
2443  }
2444  if (endline) *out << std::endl;
2445  }
2446 
2447 
2453  virtual void trainskipgrams(PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL) {
2454  if (options.MINTOKENS == -1) options.MINTOKENS = 2;
2455  this->cache_grouptotal.clear(); //forces recomputation of statistics
2456  for (int n = 3; n <= options.MAXLENGTH; n++) {
2457  if (this->gapmasks[n].empty()) this->gapmasks[n] = compute_skip_configurations(n, options.MAXSKIPS);
2458  if (!options.QUIET) std::cerr << "Counting " << n << "-skipgrams" << std::endl;
2459  int foundskipgrams = 0;
2460  for (typename MapType::iterator iter = this->begin(); iter != this->end(); iter++) {
2461  const PatternPointer pattern = PatternPointer(&(iter->first));
2462  const IndexedData multirefs = iter->second;
2463  if (((int) pattern.n() == n) && (pattern.category() == NGRAM) ) foundskipgrams += this->computeskipgrams(pattern,options, NULL, &multirefs, constrainbymodel, false);
2464  }
2465  if (!foundskipgrams) {
2466  std::cerr << " None found" << std::endl;
2467  break;
2468  } else {
2469  this->hasskipgrams = true;
2470  }
2471  if (!options.QUIET) std::cerr << " Found " << foundskipgrams << " skipgrams...";
2472  unsigned int pruned = this->prune(options.MINTOKENS,n);
2473  if (!options.QUIET) std::cerr << "pruned " << pruned;
2474  unsigned int prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, n);
2475  if (prunedextra && !options.QUIET) std::cerr << " plus " << prunedextra << " extra skipgrams..";
2476  if (!options.QUIET) std::cerr << "...total kept: " << foundskipgrams - pruned - prunedextra << std::endl;
2477  }
2478  }
2479 
2485  if (this->reverseindex == NULL) {
2486  std::cerr << "ERROR: getpatternfromtoken() No reverse index loaded" << std::endl;
2487  throw InternalError();
2488  }
2489  return this->reverseindex->getpattern(ref,1);
2490  }
2491 
2492 
2493 
2500  t_relationmap skipcontent; //will hold all skipcontent
2501  if (this->reverseindex == NULL) {
2502  std::cerr << "ERROR: No corpus data loaded! (in PatternModel::getskipcontent)" << std::endl;
2503  throw InternalError();
2504  }
2505 
2506  if (pattern.category() == SKIPGRAM) {
2507  const unsigned int n = pattern.n();
2508  const uint32_t skipcontent_mask = reversemask(pattern.mask,n );
2509 
2510  const int head = maskheadskip(skipcontent_mask, n); //skip at begin
2511  const int tail = masktailskip(skipcontent_mask,n); //skip at end
2512 
2513 
2514  const IndexedData * data = getdata(pattern);
2515  for (IndexedData::const_iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {
2516  const IndexReference ref = *iter2;
2517 
2518  //raw skipcontent with leading and trailing skips
2519  PatternPointer skipcontent_atref_raw = this->reverseindex->getpattern(ref,n);
2520  skipcontent_atref_raw.mask = skipcontent_mask;
2521 
2522  //trim leading and trailing skips
2523  Pattern skipcontent_atref = PatternPointer(skipcontent_atref_raw, head, n-head-tail); //pattern from patternpointer
2524  skipcontent[skipcontent_atref] += 1;
2525  }
2526 
2527  } else if (pattern.category() == FLEXGRAM) {
2528  //TODO: implement
2529  }
2530  //std::cerr << "Total found " << skipcontent.size() << std::endl;
2531  return skipcontent;
2532  }
2533 
2539  void prunerelations(t_relationmap & relations, unsigned int occurrencethreshold) {
2540  t_relationmap::iterator eraseiter;
2541  t_relationmap::iterator iter = relations.begin();
2542  while (iter != relations.end()) {
2543  if (iter->second < occurrencethreshold) {
2544  eraseiter = iter;
2545  iter++;
2546  relations.erase(eraseiter);
2547  } else {
2548  iter++;
2549  }
2550  }
2551  }
2552 
2559  t_relationmap gettemplates(const Pattern & pattern, unsigned int occurrencethreshold = 0) {
2560  //returns patterns that are an abstraction of the specified pattern
2561  //skipgrams
2562  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2563  std::cerr << "ERROR: No reverse index present" << std::endl;
2564  throw InternalError();
2565  }
2566 
2567  IndexedData * data = this->getdata(pattern);
2568  if (data == NULL) {
2569  throw NoSuchPattern();
2570  }
2571 
2572  t_relationmap templates;
2573 
2574 
2575  const int _n = pattern.n();
2576  //search in forward index
2577  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2578  const IndexReference ref = *iter;
2579 
2580  //search in reverse index
2581  std::vector<PatternPointer> rindex = this->getreverseindex(ref);
2582  for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2583  const PatternPointer candidate = *iter2;
2584 
2585  if (((int) candidate.n() == _n) && (candidate != pattern) && (candidate.category() == SKIPGRAM) && ((occurrencethreshold == 0) || (this->occurrencecount(pattern) >= occurrencethreshold)) ) {
2586  templates[candidate] += 1;
2587  }
2588  }
2589  }
2590  if (occurrencethreshold > 0) this->prunerelations(templates, occurrencethreshold);
2591  return templates;
2592  }
2593 
2601  t_relationmap getinstances(const Pattern & pattern, unsigned int occurrencethreshold = 0) {
2602  //returns patterns that instantiate the specified pattern
2603  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2604  std::cerr << "ERROR: No reverse index present" << std::endl;
2605  throw InternalError();
2606  }
2607 
2608  IndexedData * data = this->getdata(pattern);
2609  if (data == NULL) {
2610  throw NoSuchPattern();
2611  }
2612 
2613  t_relationmap instances;
2614 
2615 
2616  const int _n = pattern.n();
2617  //search in forward index
2618  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2619  const IndexReference ref = *iter;
2620 
2621  //search in reverse index
2622  std::vector<PatternPointer> rindex = this->getreverseindex(ref);
2623  for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2624  const PatternPointer candidate = *iter2;
2625 
2626  if (((int) candidate.n() == _n) && (candidate != pattern) && (candidate.category() == NGRAM) && ((occurrencethreshold == 0) || (this->occurrencecount(pattern) >= occurrencethreshold)) ) {
2627  instances[candidate] += 1;
2628  }
2629  }
2630  }
2631  if (occurrencethreshold > 0) this->prunerelations(instances, occurrencethreshold);
2632  return instances;
2633  }
2634 
2642  t_relationmap getsubchildren(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
2643  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2644  std::cerr << "ERROR: No reverse index present" << std::endl;
2645  throw InternalError();
2646  }
2647 
2648  IndexedData * data = this->getdata(pattern);
2649  if (data == NULL) {
2650  throw NoSuchPattern();
2651  }
2652 
2653 
2654 
2655 
2656  t_relationmap subchildren;
2657  const int _n = pattern.n();
2658  const bool isskipgram = (pattern.category() == SKIPGRAM);
2659  //search in forward index
2660  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2661  const IndexReference ref = *iter;
2662  for (int i = ref.token; i < ref.token + _n; i++) {
2664  int maxsubn = _n - (i - ref.token);
2665 
2666  //std::cerr << "Begin " << begin.sentence << ":" << begin.token << ",<< std::endl;
2667 
2668  //search in reverse index
2669  std::vector<PatternPointer> rindex = this->getreverseindex(begin);
2670  for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2671  const PatternPointer candidate = *iter2;
2672  //std::cerr << "Considering candidate @" << ref2.sentence << ":" << ref2.token << ", n=" << candidate.n() << ", bs=" << candidate.bytesize() << std::endl;
2673  //candidate.out();
2674  if (((int) candidate.n() <= maxsubn) && (candidate != pattern)
2675  && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold))
2676  && ((category == 0) || (candidate.category() >= category))
2677  && ((size == 0) || (candidate.n() >= size))
2678  ) {
2679  if ((isskipgram) || (candidate.category() == SKIPGRAM)) { //MAYBE TODO: I may check too much now... could be more efficient?
2680  //candidate may not have skips in places where the larger pattern does
2681  Pattern tmpl = Pattern(pattern, i, candidate.n()); //get the proper slice to match
2682  if (candidate.instanceof(tmpl)) {
2683  subchildren[candidate] = subchildren[candidate] + 1;
2684  }
2685  } else if (candidate.category() == FLEXGRAM) {
2686  //TODO
2687  } else {
2688  subchildren[candidate]++;
2689  }
2690  }
2691  }
2692  }
2693  }
2694  if (occurrencethreshold > 0) this->prunerelations(subchildren, occurrencethreshold);
2695  return subchildren;
2696  }
2697 
2705  t_relationmap getsubparents(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
2706  //returns patterns that subsume the specified pattern (i.e. larger
2707  //patterns)
2708  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2709  std::cerr << "ERROR: No reverse index present" << std::endl;
2710  throw InternalError();
2711  }
2712 
2713  IndexedData * data = this->getdata(pattern);
2714  if (data == NULL) {
2715  throw NoSuchPattern();
2716  }
2717 
2718  t_relationmap subsumes;
2719  const int _n = pattern.n();
2720  //search in forward index
2721  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2722  const IndexReference ref = *iter;
2723 
2724 
2725  //search in reverse index
2726  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_bysentence(ref.sentence);
2727  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2728  if ((iter2->first.sentence != ref.sentence) || (iter2->first.token > ref.token)) break;
2729  const PatternPointer candidate = iter2->second;
2730 
2731  int minsubsize = _n + (ref.token - iter2->first.token);
2732 
2733  if (((int) candidate.n() >= minsubsize) && (candidate != pattern)
2734  && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold))
2735  && ((category == 0) || (candidate.category() >= category))
2736  && ((size == 0) || (candidate.n() >= size))
2737  ) {
2738  if ((candidate.category() == SKIPGRAM) || (pattern.category() == SKIPGRAM)) {//MAYBE TODO: I may check too much now... could be more efficient?
2739  //instance may not have skips in places where the larger candidate pattern does
2740  Pattern inst = Pattern(candidate, iter2->first.token, pattern.n()); //get the proper slice to match
2741  if (pattern.instanceof(candidate)) {
2742  subsumes[candidate] += 1;
2743  }
2744  } else if (candidate.category() == FLEXGRAM) {
2745  //TODO
2746  } else {
2747  subsumes[candidate] += 1;
2748  }
2749  }
2750  }
2751  }
2752  if (occurrencethreshold > 0) this->prunerelations(subsumes, occurrencethreshold);
2753  return subsumes;
2754  }
2755 
2756 
2764  t_relationmap getleftneighbours(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0, unsigned int cutoff=0) {
2765  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2766  std::cerr << "ERROR: No reverse index present" << std::endl;
2767  throw InternalError();
2768  }
2769 
2770  IndexedData * data = this->getdata(pattern);
2771  if (data == NULL) {
2772  throw NoSuchPattern();
2773  }
2774 
2775  t_relationmap neighbours;
2776  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2777  const IndexReference ref = *iter;
2778 
2779 
2780  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_bysentence(ref.sentence);
2781  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2782  const IndexReference ref2 = iter2->first;
2783  const PatternPointer neighbour = iter2->second;
2784  if ((ref2.token + neighbour.n() == ref.token)
2785  && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
2786  && ((category == 0) || (neighbour.category() >= category))
2787  && ((size == 0) || (neighbour.n() >= size))
2788  ){
2789  neighbours[neighbour]++;
2790  if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
2791  } else if ((ref2.token > ref.token) || (ref2.sentence > ref.sentence)) break;
2792  }
2793  if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
2794  }
2795  if (occurrencethreshold > 0) this->prunerelations(neighbours, occurrencethreshold);
2796  return neighbours;
2797  }
2798 
2806  t_relationmap getrightneighbours(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0, unsigned int cutoff=0) {
2807  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2808  std::cerr << "ERROR: No reverse index present" << std::endl;
2809  throw InternalError();
2810  }
2811 
2812  IndexedData * data = this->getdata(pattern);
2813  if (data == NULL) {
2814  throw NoSuchPattern();
2815  }
2816 
2817  t_relationmap neighbours;
2818  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2819  IndexReference ref = *iter;
2820  ref.token += pattern.size();
2821 
2822  //search in reverse index
2823  std::vector<PatternPointer> rindex = this->getreverseindex(ref);
2824  for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2825  const PatternPointer neighbour = *iter2;
2826  if ( ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
2827  && ((category == 0) || (neighbour.category() >= category))
2828  && ((size == 0) || (neighbour.n() >= size)) ) {
2829  neighbours[neighbour]++;
2830  if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
2831  }
2832  }
2833  if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
2834  }
2835  if (occurrencethreshold > 0) this->prunerelations(neighbours, occurrencethreshold);
2836  return neighbours;
2837  }
2838 
2845  int pruneskipgrams(int threshold, int minskiptypes, int _n = 0) {
2846  int pruned = 0;
2847  if (minskiptypes <=1) return pruned; //nothing to do
2848 
2850  while(iter != this->end()) {
2851  const PatternType pattern = iter->first;
2852  if (( (_n == 0) || ((int) pattern.n() == _n) ) && (pattern.category() == SKIPGRAM)) {
2853  t_relationmap skipcontent = getskipcontent(pattern);
2854  t_relationmap skipcontent2 = getskipcontent(pattern); //TODO: remove debug
2855  if (skipcontent2.size() != skipcontent.size()) {
2856  std::cerr << " Pattern " << pattern.hash() << " discrepancy!!! " << skipcontent.size() << " vs " << skipcontent2.size() << std::endl;
2857  throw InternalError();
2858  }
2859  //std::cerr << " Pattern " << pattern.hash() << " occurs: " << this->occurrencecount(pattern) << " skipcontent=" << skipcontent.size() << std::endl;
2860  if ((int) skipcontent.size() < minskiptypes) { //will take care of token threshold too, patterns not meeting the token threshold are not included
2861  //std::cerr << "..pruning" << std::endl;
2862  iter = this->erase(iter);
2863  pruned++;
2864  continue;
2865  }
2866  }
2867  iter++;
2868  }
2869  return pruned;
2870  }
2871 
2877  virtual void computecoveragestats(int category = 0, int n = 0) {
2878  //opting for memory over speed (more iterations, less memory)
2879  //overloaded version for indexedmodel
2880  if ((this->cache_grouptotal.empty()) && (!this->data.empty())) this->computestats();
2881 
2882  if ((this->cache_n.size() == 1) && (*this->cache_n.begin() == 1) && (n <= 1)) {
2883  //special condition, only unigrams, we can be done quicker
2884  this->cache_grouptotalwordtypes[0][1] = this->size();
2886  while (iter != this->end()) {
2887  this->cache_grouptotaltokens[0][1] += this->valuehandler.count(iter->second);
2888  iter++;
2889  }
2890  return;
2891  }
2892 
2893  for (std::set<int>::iterator iterc = this->cache_categories.begin(); iterc != this->cache_categories.end(); iterc++) {
2894  if ((category == 0) || (*iterc == category)) {
2895  for (std::set<int>::iterator itern = this->cache_n.begin(); itern != this->cache_n.end(); itern++) {
2896  if (((n == 0) || (*itern == n)) && (this->cache_grouptotalwordtypes[*iterc][*itern] == 0) ) {
2897  std::unordered_set<Pattern> types;
2898  std::set<IndexReference> tokens;
2900  while (iter != this->end()) {
2901  const Pattern pattern = iter->first;
2902  const int n = pattern.n();
2903  if ( (n == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {
2904  types.insert(pattern);
2905  } else {
2906  if (((*itern == 0) || (n == *itern)) && ((*iterc == 0) || (pattern.category() == *iterc))) {
2907  std::vector<Pattern> unigrams;
2908  pattern.ngrams(unigrams, 1);
2909  for (std::vector<Pattern>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
2910  const Pattern p = *iter2;
2911  types.insert(p);
2912  }
2913  }
2914  }
2915  IndexedData * data = this->getdata(pattern);
2916  for (IndexedData::iterator dataiter = data->begin(); dataiter != data->end(); dataiter++) {
2917  //take into account all tokens
2918  for (unsigned int i = 0; i < pattern.n(); i++) {
2919  tokens.insert(*dataiter + i);
2920  }
2921  }
2922  iter++;
2923  }
2924  this->cache_grouptotalwordtypes[*iterc][*itern] += types.size();
2925  this->cache_grouptotaltokens[*iterc][*itern] += tokens.size();
2926  }
2927  }
2928  }
2929  }
2930  }
2931 
2939  t_relationmap getrightcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0,IndexedData * matches = NULL) {
2940  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2941  std::cerr << "ERROR: No reverse index present" << std::endl;
2942  throw InternalError();
2943  }
2944 
2945  IndexedData * data = this->getdata(pattern);
2946  if (data == NULL) {
2947  throw NoSuchPattern();
2948  }
2949 
2950  const int _n = pattern.n();
2951  t_relationmap cooc;
2952  //find everything that co-occurs *without overlap* TO THE RIGHT
2953  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2954  const IndexReference ref = *iter;
2955 
2956 
2957  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_right(ref);
2958  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2959  const IndexReference ref2 = iter2->first;
2960  const PatternPointer neighbour = iter2->second;
2961  if ( (ref2.token > ref.token + _n)
2962  && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
2963  && ((category == 0) || (neighbour.category() >= category))
2964  && ((size == 0) || (neighbour.n() >= size))
2965  ) {
2966  cooc[neighbour]++;
2967  if (matches != NULL) matches->insert(ref2);
2968  }
2969  }
2970  }
2971  if (occurrencethreshold > 0) this->prunerelations(cooc, occurrencethreshold);
2972  return cooc;
2973  }
2974 
2975 
2983  t_relationmap getleftcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
2984  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
2985  std::cerr << "ERROR: No reverse index present" << std::endl;
2986  throw InternalError();
2987  }
2988 
2989  IndexedData * data = this->getdata(pattern);
2990  if (data == NULL) {
2991  throw NoSuchPattern();
2992  }
2993 
2994  t_relationmap cooc;
2995  //find everything that co-occurs *without overlap* TO THE LEFT
2996  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
2997  const IndexReference ref = *iter;
2998 
2999  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_left(ref);
3000  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
3001  const IndexReference ref2 = iter2->first;
3002  const PatternPointer neighbour = iter2->second;
3003  const int _n = neighbour.n();
3004  if ( (ref2.token + _n < ref.token )
3005  && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
3006  && ((category == 0) || (neighbour.category() >= category))
3007  && ((size == 0) || (neighbour.n() >= size))
3008  ) {
3009  cooc[neighbour]++;
3010  }
3011  }
3012  }
3013  if (occurrencethreshold > 0) this->prunerelations(cooc, occurrencethreshold);
3014  return cooc;
3015  }
3016 
3017 
3026  t_relationmap getcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0,bool ordersignificant = false) {
3027  if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
3028  std::cerr << "ERROR: No reverse index present" << std::endl;
3029  throw InternalError();
3030  }
3031 
3032  IndexedData * data = this->getdata(pattern);
3033  if (data == NULL) {
3034  throw NoSuchPattern();
3035  }
3036 
3037  const int _n = pattern.n();
3038  t_relationmap cooc;
3039  //find everything that co-occurs *without overlap* TO THE RIGHT
3040  for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
3041  const IndexReference ref = *iter;
3042 
3043 
3044  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_bysentence(ref.sentence);
3045  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
3046  const IndexReference ref2 = iter2->first;
3047  const PatternPointer neighbour = iter2->second;
3048  if ((ordersignificant) && (neighbour.pattern() < pattern)) continue;
3049  const int _n2 = neighbour.n();
3050  if ( ((ref2.token + _n2 < ref.token ) || (ref2.token > ref.token + _n))
3051  && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
3052  && ((category == 0) || (neighbour.category() >= category))
3053  && ((size == 0) || (neighbour.n() >= size))
3054  ) {
3055  cooc[neighbour]++;
3056  }
3057  }
3058  }
3059  if (occurrencethreshold > 0) this->prunerelations(cooc, occurrencethreshold);
3060  return cooc;
3061  }
3062 
3067  double npmi(const PatternPointer & key1, const PatternPointer & key2, int jointcount) {
3068  //normalised pointwise mutual information
3069  return log( (double) jointcount / (this->occurrencecount(key1) * this->occurrencecount(key2)) ) / -log((double)jointcount/(double)this->totaloccurrencesingroup(0,0) );
3070  }
3071 
3080  void outputrelations(const PatternPointer & pattern, t_relationmap & relations, ClassDecoder & classdecoder, std::ostream *OUT, const std::string label = "RELATED-TO") {
3081  int total = 0;
3082  for (t_relationmap::iterator iter = relations.begin(); iter != relations.end(); iter++) {
3083  total += iter->second;
3084  }
3085  if (total == 0) return;
3086  double total_f = total;
3087  const std::string pattern_s = pattern.tostring(classdecoder);
3088  for (t_relationmap::iterator iter = relations.begin(); iter != relations.end(); iter++) {
3089  const PatternPointer pattern2 = iter->first;
3090  *OUT << "\t" << pattern_s << "\t" << label << "\t" << pattern2.tostring(classdecoder) << "\t" << iter->second << "\t" << iter->second / total_f << "\t" << this->occurrencecount(pattern2) << std::endl;
3091  }
3092  }
3093 
3101  void outputrelations(const PatternPointer & pattern, ClassDecoder & classdecoder, std::ostream * OUT, bool outputheader=true) {
3102  if (outputheader) *OUT << "#\tPATTERN1\tRELATION\tPATTERN2\tREL.COUNT\tREL.FREQUENCY\tCOUNT2" << std::endl;
3103  {
3104  t_relationmap relations = this->getsubparents(pattern);
3105  this->outputrelations(pattern, relations, classdecoder, OUT, "SUBSUMED-BY");
3106  }
3107  {
3108  t_relationmap relations = this->getsubchildren(pattern);
3109  this->outputrelations(pattern, relations, classdecoder, OUT, "SUBSUMES");
3110  }
3111  {
3112  t_relationmap relations = this->getleftneighbours(pattern);
3113  this->outputrelations(pattern, relations, classdecoder, OUT, "RIGHT-NEIGHBOUR-OF");
3114  }
3115  {
3116  t_relationmap relations = this->getrightneighbours(pattern);
3117  this->outputrelations(pattern, relations, classdecoder, OUT, "LEFT-NEIGHBOUR-OF");
3118  }
3119  {
3120  t_relationmap relations = this->getrightcooc(pattern);
3121  this->outputrelations(pattern, relations, classdecoder, OUT, "LEFT-COOC-OF");
3122  }
3123  {
3124  t_relationmap relations = this->getleftcooc(pattern);
3125  this->outputrelations(pattern, relations, classdecoder, OUT, "RIGHT-COOC-OF");
3126  }
3127  if (pattern.category() == SKIPGRAM) {
3128  t_relationmap relations = this->getskipcontent(pattern);
3129  this->outputrelations(pattern, relations, classdecoder, OUT, "INSTANTIATED-BY");
3130  }
3131  }
3132 
3133 
3134  /*
3135  * Compute co-occurence as normalised pointwise mutual information for all patterns
3136  * @param coocmap The map that will store the results
3137  * @param threshold Only include pairs passing this NPMI threshold
3138  * @param right Compute co-occurence to the right (default: true)
3139  * @param left Compute co-occurence to the left (default: true)
3140  */
3141  void computenpmi( std::map<PatternPointer,t_relationmap_double> & coocmap , double threshold, bool right=true, bool left=true) {
3142  //compute npmi co-occurrence for all patterns
3143 
3144  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {
3145  const PatternType pattern = iter->first;
3146  t_relationmap tmp;
3147  if ((right)&&(!left)) {
3148  tmp = this->getrightcooc(pattern);
3149  } else if ((left)&&(!right)) {
3150  tmp = this->getleftcooc(pattern);
3151  } else if (left && right) { //order not relevant
3152  tmp = this->getcooc(pattern);
3153  }
3154  for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {
3155  const PatternPointer pattern2 = iter2->first;
3156  const double value = npmi(pattern,pattern2,iter2->second);
3157  if (value >= threshold) coocmap[pattern][pattern2] = value;
3158  }
3159  }
3160  }
3161 
3169  void computecooc( std::map<PatternPointer,t_relationmap> & coocmap , int threshold, bool right=true, bool left=true) {
3170  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {
3171  const PatternType pattern = iter->first;
3172  t_relationmap tmp;
3173  if ((right)&&(!left)) {
3174  tmp = this->getrightcooc(pattern, threshold);
3175  } else if ((left)&&(!right)) {
3176  tmp = this->getleftcooc(pattern, threshold);
3177  } else if (left && right) { //order not relevant
3178  tmp = this->getcooc(pattern, threshold);
3179  }
3180  for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {
3181  const PatternPointer pattern2 = iter2->first;
3182  const double value = iter2->second;
3183  if (value >= threshold) coocmap[pattern][pattern2] = value;
3184  }
3185  }
3186  }
3187 
3193  this->cache_grouptotal.clear(); //forces recomputation of statistics
3194  int count = 0;
3195  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {
3196  const PatternType pattern = iter->first;
3197  if (pattern.category() == SKIPGRAM) {
3198  const PatternType flexgram = pattern.toflexgram();
3199  if (!this->has(flexgram)) count++;
3200  //copy data from pattern
3201  IndexedData * data = this->getdata(pattern);
3202  for (IndexedData::iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {
3203  const IndexReference ref = *iter2;
3204  this->data[flexgram].insert(ref);
3205  }
3206  }
3207  }
3208  return count;
3209  }
3210 
3211 
3217  int computeflexgrams_fromcooc(double threshold) { //TODO: won't work in pattern pointer model
3218  this->cache_grouptotal.clear(); //forces recomputation of statistics
3219  int found = 0;
3220  const unsigned char dynamicgap = 129;
3221  const Pattern dynamicpattern = Pattern(&dynamicgap,1);
3222  for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {
3223  const PatternType pattern = iter->first;
3224  IndexedData matches;
3225  t_relationmap tmp = this->getrightcooc(pattern, 0,0,0, &matches);
3226  for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {
3227  const PatternPointer pattern2 = iter2->first;
3228  const double value = npmi(pattern,pattern2,iter2->second);
3229  if (value >= threshold) {
3230  const Pattern flexgram = pattern + dynamicpattern + pattern2;
3231  if (!this->has(flexgram)) found++;
3232  this->data[flexgram] = value;
3233  }
3234  }
3235  }
3236  return found;
3237  }
3238 
3243  void outputcooc_npmi(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {
3244  std::map<PatternPointer,t_relationmap_double> npmimap;
3245  std::cerr << "Collecting patterns and computing NPMI..." << std::endl;
3246  computenpmi(npmimap, threshold);
3247 
3248  std::cerr << "Building inverse map..." << std::endl;
3249  //we want the reverse, so we can sort by co-occurrence
3250  std::multimap<double,std::pair<PatternPointer,PatternPointer>> inversemap;
3251  std::map<PatternPointer,t_relationmap_double>::iterator iter = npmimap.begin();
3252  while (iter != npmimap.end()) {
3253  for (t_relationmap_double::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++) {
3254  inversemap.insert(std::pair<double,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<Pattern,Pattern>(iter->first, iter2->first)));
3255  }
3256  iter = npmimap.erase(iter);
3257  }
3258 
3259  *OUT << "Pattern1\tPattern2\tNPMI" << std::endl;
3260  for (std::multimap<double,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
3261  const PatternPointer pattern1 = iter2->second.first;
3262  const PatternPointer pattern2 = iter2->second.second;
3263  *OUT << pattern1.tostring(classdecoder) << "\t" << pattern2.tostring(classdecoder) << "\t" << iter2->first << std::endl;
3264  }
3265  }
3266 
3271  void outputcooc(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {
3272  std::map<PatternPointer,t_relationmap> coocmap;
3273  std::cerr << "Collecting patterns and computing co-occurrence..." << std::endl;
3274  computecooc(coocmap, threshold);
3275 
3276  std::cerr << "Building inverse map..." << std::endl;
3277  //we want the reverse, so we can sort by co-occurrence
3278  std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>> inversemap;
3279  std::map<PatternPointer,t_relationmap>::iterator iter = coocmap.begin();
3280  while (iter != coocmap.end()) {
3281  for (t_relationmap::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++) {
3282  inversemap.insert(std::pair<uint32_t,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<PatternPointer,PatternPointer>(iter->first, iter2->first)));
3283  }
3284  iter = coocmap.erase(iter);
3285  }
3286 
3287  *OUT << "Pattern1\tPattern2\tCooc" << std::endl;
3288  for (std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
3289  const Pattern pattern1 = iter2->second.first;
3290  const Pattern pattern2 = iter2->second.second;
3291  *OUT << pattern1.tostring(classdecoder) << "\t" << pattern2.tostring(classdecoder) << "\t" << iter2->first << std::endl;
3292  }
3293  }
3294 
3300  int flexgramsize(const Pattern & pattern, IndexReference begin) {
3301 
3302  if (pattern.category() != FLEXGRAM) return pattern.n();
3303 
3304  std::vector<Pattern> parts;
3305  int numberofparts = pattern.parts(parts);
3306  bool strictbegin = true;
3307  std::multimap<int, IndexReference> partmatches;
3308  int i = 0;
3309  std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_right(begin); //TODO: Check
3310  for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter = rindex.begin(); iter != rindex.end(); iter++) {
3311  const PatternPointer part = iter->second;
3312  IndexReference ref = iter->first;
3313  partmatches.insert(std::pair<int,IndexReference>(i, ref));
3314  i++;
3315  }
3316 
3317  int firsttoken = begin.token;
3318  IndexReference nextbegin = IndexReference(begin.sentence,999);
3319  for (int j = 0; j < numberofparts; j++) {
3320  //find a path
3321  int prevlevel = -1;
3322  bool found = false;
3323  for (std::multimap<int, IndexReference>::iterator iter = partmatches.lower_bound(j); iter != partmatches.upper_bound(j); iter++) {
3324  found = true;
3325  if (iter->first != prevlevel) {
3326  begin = nextbegin;
3327  nextbegin = IndexReference(begin.sentence,999); //reset
3328  }
3329  if (((iter->second == begin) || (begin < iter->second)) && (iter->second + parts[j].n() + 1 < nextbegin)) {
3330  nextbegin = iter->second + parts[j].n() + 1;
3331  }
3332  prevlevel = iter->first;
3333  }
3334  if (!found) return 0;
3335  }
3336  return (nextbegin.token - firsttoken);
3337  }
3338 
3339 };
3340 
3341 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class MapType = PatternPointerMap<ValueType, BaseValueHandler<ValueType>>>
3342 class PatternPointerModel: public PatternModel<ValueType,ValueHandler,MapType,PatternPointer> {
3343  public:
3344 
3346  this->model_type = this->getmodeltype();
3347  this->model_version = this->getmodelversion();
3348  if (corpus) {
3349  this->reverseindex = corpus;
3350  this->attachcorpus(*corpus);
3351  } else {
3352  this->reverseindex = NULL;
3353  }
3354  }
3355 
3356 
3365  this->model_type = this->getmodeltype();
3366  this->model_version = this->getmodelversion();
3367  if (corpus) {
3368  this->reverseindex = corpus;
3369  this->attachcorpus(*corpus);
3370  } else {
3371  this->reverseindex = NULL;
3372  }
3373  this->load(f,options, constrainmodel);
3374  }
3375 
3383  PatternPointerModel<ValueType,ValueHandler,MapType>(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): PatternModel<ValueType,ValueHandler,MapType,PatternPointer>() { //load from file
3384  this->model_type = this->getmodeltype();
3385  this->model_version = this->getmodelversion();
3386  if (corpus) {
3387  this->reverseindex = corpus;
3388  this->attachcorpus(*corpus);
3389  } else {
3390  this->reverseindex = NULL;
3391  }
3392  std::ifstream * in = new std::ifstream(filename.c_str());
3393  this->load( (std::istream *) in, options, constrainmodel);
3394  in->close();
3395  delete in;
3396  }
3397 
3399  int getmodelversion() const { return 2;}
3400 
3408  virtual void add(const PatternPointer & patternpointer, const IndexReference & ref) {
3409  if ((patternpointer.data < this->reverseindex->beginpointer()) || (patternpointer.data > this->reverseindex->beginpointer() + this->reverseindex->bytesize())) {
3410  std::cerr << "Pattern Pointer points outside contained corpus data..." << std::endl;
3411  throw InternalError();
3412  }
3413  ValueType * data = this->getdata(patternpointer, true);
3414  /*std::cerr << "Adding: n="<< patternpointer.n() << ",b=" << patternpointer.bytesize() << ",hash="<<patternpointer.hash()<<", value=" << *data << ",valuetype="<< (size_t) data << ",mask=" << patternpointer.mask << ",pattern=";
3415  patternpointer.out();
3416  std::cerr << std::endl;*/
3417  this->add(patternpointer, data, ref );
3418  /*std::cerr << " Hash recheck: " << patternpointer.hash() << std::endl;
3419  std::cerr << " Pattern hash recheck: " << Pattern(patternpointer).hash() << std::endl;
3420  std::cerr << " Equivalence with Pattern: " << (int) (patternpointer == Pattern(patternpointer)) << std::endl;
3421  std::cerr << " Equivalence with Pattern 2: " << (int) (Pattern(patternpointer) == patternpointer) << std::endl;
3422  std::cerr << " New value verification: " << this->occurrencecount(patternpointer) << " == " << *data << std::endl;
3423  ValueType * data2 = this->getdata(patternpointer, true);
3424  std::cerr << " New value verification (2): " << *data << " == " << *data2 << std::endl;
3425  std::cerr << " New value verification (pointer): " << (size_t) data << " == " << (size_t) data2 << std::endl;*/
3426  }
3427 
3428  virtual void add(const PatternPointer & pattern, ValueType * value, const IndexReference & ref) {
3429  if (value == NULL) {
3430  std::cerr << "Add() value is NULL!" << std::endl;
3431  throw InternalError();
3432  }
3433  this->valuehandler.add(value, ref);
3434  }
3435 
3436 };
3437 
3438 
3439 
3440 template<class MapType=PatternPointerMap<IndexedData, IndexedDataHandler>>
3441 class IndexedPatternPointerModel: public IndexedPatternModel<MapType,PatternPointer> {
3442  public:
3443 
3445  this->model_type = this->getmodeltype();
3446  this->model_version = this->getmodelversion();
3447  if (corpus) {
3448  this->reverseindex = corpus;
3449  this->attachcorpus(*corpus);
3450  } else {
3451  this->reverseindex = NULL;
3452  }
3453  }
3454 
3455 
3463  IndexedPatternPointerModel<MapType>(std::istream *f, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): IndexedPatternModel<MapType,PatternPointer>(){ //load from file
3464  this->model_type = this->getmodeltype();
3465  this->model_version = this->getmodelversion();
3466  if (corpus) {
3467  this->reverseindex = corpus;
3468  this->attachcorpus(*corpus);
3469  } else {
3470  this->reverseindex = NULL;
3471  }
3472  this->load(f,options, constrainmodel);
3473  }
3474 
3482  IndexedPatternPointerModel<MapType>(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): IndexedPatternModel<MapType,PatternPointer>() { //load from file
3483  this->model_type = this->getmodeltype();
3484  this->model_version = this->getmodelversion();
3485  if (corpus) {
3486  this->reverseindex = corpus;
3487  this->attachcorpus(*corpus);
3488  } else {
3489  this->reverseindex = NULL;
3490  }
3491  std::ifstream * in = new std::ifstream(filename.c_str());
3492  this->load( (std::istream *) in, options, constrainmodel);
3493  in->close();
3494  delete in;
3495  }
3496 
3498  int getmodelversion() const { return 2;}
3499 
3507  void add(const PatternPointer & patternpointer, const IndexReference & ref) {
3508  if ((patternpointer.data < this->reverseindex->beginpointer()) || (patternpointer.data > this->reverseindex->beginpointer() + this->reverseindex->bytesize())) {
3509  std::cerr << "Pattern Pointer points outside contained corpus data..." << std::endl;
3510  throw InternalError();
3511  }
3512  IndexedData * data = this->getdata(patternpointer, true);
3513  this->add(patternpointer, data, ref );
3514  }
3515 
3516  void add(const PatternPointer & patternpointer, IndexedData * value, const IndexReference & ref) {
3517  if (value == NULL) {
3518  value = this->getdata(patternpointer,true);
3519  }
3520  this->valuehandler.add(value, ref);
3521  }
3522 };
3523 
3524 double comparemodels_loglikelihood(const Pattern pattern, std::vector<PatternModel<uint32_t>* > & models);
3525 void comparemodels_loglikelihood(std::vector<PatternModel<uint32_t>* > & models, PatternMap<double> * resultmap, bool conjunctiononly = false, std::ostream * output = NULL, ClassDecoder * classdecoder = NULL );
3526 
3527 
3528 #endif
void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3271
bool out() const
Definition: pattern.cpp:345
virtual int minlength() const =0
void write(std::ostream *out)
Definition: patternmodel.h:438
void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:2398
unsigned char version() const
Definition: patternmodel.h:1379
int minn
Definition: patternmodel.h:534
void report(std::ostream *OUT)
Definition: patternmodel.h:2056
virtual t_relationmap gettemplates(const Pattern &pattern, int=0)
Definition: patternmodel.h:2171
virtual t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2173
int maskheadskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:68
virtual t_relationmap_double getnpmi(const Pattern &pattern, double threshold)
Definition: patternmodel.h:2176
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:2203
virtual void printreverseindex(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1883
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:377
virtual int minlength() const
Definition: patternmodel.h:492
unsigned char type() const
Definition: patternmodel.h:1378
Definition: pattern.h:61
Definition: patternmodel.h:86
virtual t_relationmap getinstances(const Pattern &pattern, int=0)
Definition: patternmodel.h:2172
int MINSKIPTYPES
Minimum required amount of distinct patterns that can fit in a gap of a skipgram for the skipgram to ...
Definition: patternmodel.h:136
size_t size() const
Definition: patternstore.h:597
const size_t size() const
Definition: pattern.h:436
int ngrams(std::vector< PatternPointer > &container, const int n) const
Definition: pattern.cpp:1072
int MAXLENGTH
The maximum length of patterns to be loaded/extracted, inclusive (in words/tokens) (default: 100) ...
Definition: patternmodel.h:126
IndexedData * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:2299
Pattern getpatternfromtoken(IndexReference ref)
Definition: patternmodel.h:2484
unsigned int totaltokensingroup(int category, int n)
Definition: patternmodel.h:1656
void printpattern(std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true)
Definition: patternmodel.h:1935
bool erase(const Pattern &pattern)
Definition: patternstore.h:821
IndexedCorpus * reverseindex
Pointer to the reverse index and corpus data for this model (or NULL)
Definition: patternmodel.h:563
std::vector< PatternPointer > getreverseindex(const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1408
Definition: datatypes.h:477
IndexedData * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:2310
unsigned int totalpatternsingroup(int category, int n)
Definition: patternmodel.h:1634
virtual int maxlength() const
Definition: patternmodel.h:1312
bool DOREVERSEINDEX
Obsolete now, only here for backward-compatibility with v1.
Definition: patternmodel.h:139
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:1680
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:364
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t >::iterator t_relationmap_iterator
Definition: patternmodel.h:232
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:394
virtual double frequency(const Pattern &)=0
bool empty() const
Definition: patternstore.h:270
unsigned char type() const
Definition: patternmodel.h:511
Definition: patternmodel.h:73
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:682
t_relationmap getsubchildren(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2642
virtual t_relationmap getsubchildren(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2169
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1711
const bool isskipgram() const
Definition: pattern.h:170
void test(MapType &target, std::istream *in)
void printmodel(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1902
void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3243
void computecooc(std::map< PatternPointer, t_relationmap > &coocmap, int threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3169
int MINTOKENS
Definition: patternmodel.h:113
int getmodelversion() const
Definition: patternmodel.h:2272
Definition: pattern.h:357
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1533
t_relationmap getleftcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2983
t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2499
std::vector< IndexReference >::iterator iterator
Definition: datatypes.h:109
Definition: pattern.h:54
bool DOPATTERNPERLINE
Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a ...
Definition: patternmodel.h:140
Contains lower-level containers for patterns.
double comparemodels_loglikelihood(const Pattern pattern, std::vector< PatternModel< uint32_t > * > &models)
Definition: patternmodel.cpp:23
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:1569
virtual ValueType * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:1343
bool DOREMOVESKIPGRAMS
Remove skip-grams from the model upon loading it.
Definition: patternmodel.h:146
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:2194
unsigned int sentences() const
Definition: patternstore.h:150
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:1081
Definition: patternstore.h:156
A pattern model based on an unordered set, does not hold data, only patterns. Very suitable for loadi...
Definition: patternmodel.h:299
PatternPointer getpattern(const IndexReference &begin, int length=1) const
Definition: pattern.cpp:1764
int getmodeltype() const
Definition: patternmodel.h:3497
iterator end()
Definition: patternstore.h:813
virtual int getmodelversion() const
Definition: patternmodel.h:359
ModelType
Definition: patternmodel.h:72
const size_t bytesize() const
Definition: pattern.cpp:57
t_relationmap getcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, bool ordersignificant=false)
Definition: patternmodel.h:3026
t_relationmap getsubparents(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2705
virtual void trainskipgrams(const PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:1266
double npmi(const PatternPointer &key1, const PatternPointer &key2, int jointcount)
Definition: patternmodel.h:3067
virtual int getmodeltype() const
Definition: patternmodel.h:653
t_relationmap getinstances(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2601
A model mapping patterns to values, high-level interface.
Definition: patternmodel.h:526
const size_t n() const
Definition: pattern.cpp:93
int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:3192
int getmodeltype() const
Definition: patternmodel.h:3398
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807
bool DORESET
sets all counts to zero upon loading, clears indices
Definition: patternmodel.h:148
uint64_t totaltokens
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:530
virtual int maxlength() const =0
bool DOSKIPGRAMS_EXHAUSTIVE
Load/extract skipgrams in an exhaustive fashion? More memory intensive, but the only option for unin...
Definition: patternmodel.h:135
void output(std::ostream *)
vector< pair< int, int > > mask2vector(const uint32_t mask, const int n)
Definition: algorithms.cpp:35
Basic read-only interface for pattern models, abstract base class.
Definition: interface.h:39
virtual t_relationmap getleftneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2174
t_relationmap getrightneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2806
void computestats()
Definition: patternmodel.h:1526
t_relationmap getleftneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2764
Limited virtual interface to pattern stores.
Definition: interface.h:20
void info(std::ostream *OUT)
Definition: patternmodel.h:2002
int getmodelversion() const
Definition: patternmodel.h:3399
virtual int getmodelversion() const =0
PatternSetModel(const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:341
unsigned char model_type
Definition: patternmodel.h:528
virtual int minlength() const
Definition: patternmodel.h:1316
virtual int getmodelversion() const
Definition: patternmodel.h:657
void prunerelations(t_relationmap &relations, unsigned int occurrencethreshold)
Definition: patternmodel.h:2539
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:1321
void histogram(std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1948
uint64_t totaltokens
Definition: patternmodel.h:303
virtual int getmodeltype() const
Definition: patternmodel.h:358
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:558
Definition: common.h:43
virtual int computeskipgrams(const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false)
Definition: patternmodel.h:1245
void end(Measurement &m)
Definition: benchmarks.cpp:156
int MINTOKENS_UNIGRAMS
Definition: patternmodel.h:121
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t > t_relationmap
Definition: patternmodel.h:224
int MINTOKENS_SKIPGRAMS
Definition: patternmodel.h:116
uint64_t totaltypes
Definition: patternmodel.h:304
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:2877
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_right(const IndexReference ref)
Definition: patternmodel.h:1489
void outputrelations(const PatternPointer &pattern, t_relationmap &relations, ClassDecoder &classdecoder, std::ostream *OUT, const std::string label="RELATED-TO")
Definition: patternmodel.h:3080
virtual int maxlength() const
Definition: patternmodel.h:487
Definition: patternmodel.h:88
bool DEBUG
Output extra debug information.
Definition: patternmodel.h:151
PatternSet< uint64_t >::const_iterator const_iterator
Definition: patternmodel.h:482
const PatternCategory category() const
Definition: pattern.cpp:42
ReverseIndexType
Definition: patternmodel.h:85
unsigned char version() const
Definition: patternmodel.h:515
virtual void resetstats()
Definition: patternmodel.h:1559
virtual unsigned int tokens() const =0
bool DOREMOVEFLEXGRAMS
Remove flexgrams from the model upon loading it.
Definition: patternmodel.h:147
bool DOSKIPGRAMS
Load/extract skipgrams? (default: false)
Definition: patternmodel.h:134
Definition: patternmodel.h:3441
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
MapType::const_iterator const_iterator
Definition: patternmodel.h:1307
Reference to a position in the corpus.
Definition: datatypes.h:33
Definition: patternmodel.h:75
Definition: patternmodel.h:77
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782
virtual ValueType * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:1354
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644
unsigned char classencodingversion
Definition: patternstore.h:328
PatternSet< uint64_t > extractset(int minlength=1, int maxlength=1)
Definition: patternmodel.h:2147
unsigned char model_version
Definition: patternmodel.h:529
std::map< int, std::map< int, unsigned int > > cache_grouptotal
total occurrences (used for frequency computation, within a group)
Definition: patternmodel.h:539
int getmodelversion() const
Definition: patternmodel.h:3498
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const
Definition: pattern.cpp:1142
Definition: pattern.h:56
const size_t bytesize() const
Definition: pattern.h:435
t_relationmap gettemplates(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2559
PatternSet< uint64_t >::iterator iterator
Definition: patternmodel.h:481
iterator end()
Definition: datatypes.h:115
bool isgap(int i) const
Definition: pattern.cpp:126
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:669
void write(std::ostream *out)
Definition: patternstore.h:632
virtual int getmodeltype() const =0
double frequency(const Pattern &pattern)
Definition: patternmodel.h:1666
Definition: patternmodel.h:3342
Definition: patternmodel.h:78
unsigned int totalwordtypesingroup(int category, int n)
Definition: patternmodel.h:1645
virtual int computeflexgrams_fromcooc()
Definition: patternmodel.h:2178
const size_t size() const
Definition: pattern.h:156
MapType::iterator iterator
Definition: patternmodel.h:1306
bool DOREMOVEINDEX
Do not load index information (for indexed models), loads just the patterns without any counts...
Definition: patternmodel.h:144
int maxn
Definition: patternmodel.h:533
PatternModelInterface * getinterface()
Definition: patternmodel.h:465
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:672
virtual unsigned int types()=0
virtual void add(const Pattern &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1704
bool QUIET
Don't output to stderr.
Definition: patternmodel.h:150
iterator end()
Definition: patternstore.h:224
unsigned int prune(int threshold, int _n=0)
Definition: patternmodel.h:1728
int MAXBACKOFFLENGTH
Definition: patternmodel.h:127
virtual std::vector< PatternPointer > findskipgrams(const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3)
Definition: patternmodel.h:1254
virtual unsigned int occurrencecount(const Pattern &pattern)=0
int MAXSKIPS
Maximum skips per skipgram.
Definition: patternmodel.h:137
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:367
unsigned char model_type
Definition: patternmodel.h:301
void write(const std::string filename)
Definition: patternmodel.h:1299
Options for Pattern Model loading and training.
Definition: patternmodel.h:111
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39
int sentencelength(int sentence) const
Definition: pattern.cpp:1806
int PRUNENONSUBSUMED
Definition: patternmodel.h:142
uint16_t token
Definition: datatypes.h:36
int ngrams(std::vector< Pattern > &container, const int n) const
Definition: pattern.cpp:1050
std::set< int > cache_n
Definition: patternmodel.h:538
size_t size() const
Definition: patternstore.h:800
iterator begin()
Definition: patternstore.h:810
int getmodeltype(const std::string &filename)
Definition: patternmodel.cpp:4
int getmodeltype() const
Definition: patternmodel.h:2271
virtual void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2180
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const
Definition: pattern.cpp:1120
Definition: patternmodel.h:76
void insert(IndexReference ref)
Definition: datatypes.h:106
Collection of references to positions in the corpus (IndexReference). Used by Indexed Pattern models...
Definition: datatypes.h:86
virtual void print(std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true)
Definition: patternmodel.h:1911
std::map< int, std::vector< uint32_t > > gapmasks
pre-computed masks representing possible gap configurations for various pattern lengths ...
Definition: patternmodel.h:545
bool reverseindex_internal
Definition: patternmodel.h:564
virtual t_relationmap getsubparents(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2170
virtual double frequency(const Pattern &)
Definition: patternmodel.h:479
PatternSetModel()
Definition: patternmodel.h:311
double coverage(const Pattern &key)
Definition: patternmodel.h:1397
PatternMap< double, BaseValueHandler< double >, uint64_t > t_relationmap_double
Definition: patternmodel.h:230
unsigned int bytesize() const
Definition: patternstore.h:118
uint32_t mask
Definition: pattern.h:362
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_left(const IndexReference ref)
Definition: patternmodel.h:1507
virtual int computeskipgrams(const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false)
Definition: patternmodel.h:1101
const PatternCategory category() const
Definition: pattern.cpp:46
virtual void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2179
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:778
void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:3516
PatternPointer getsentence(int sentence) const
Definition: pattern.cpp:1826
bool DOREMOVENGRAMS
Remove n-grams from the model upon loading it.
Definition: patternmodel.h:145
std::set< int > cache_categories
Definition: patternmodel.h:537
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_bysentence(int sentence)
Definition: patternmodel.h:1471
void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3507
void outputrelations(const PatternPointer &pattern, ClassDecoder &classdecoder, std::ostream *OUT, bool outputheader=true)
Definition: patternmodel.h:3101
void write(const std::string &filename)
Definition: patternmodel.h:455
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2329
PatternModelInterface * getinterface()
Definition: patternmodel.h:765
Definition: patternmodel.h:74
iterator begin()
Definition: patternstore.h:214
virtual unsigned int tokens() const
Definition: patternmodel.h:505
unsigned char * data
Definition: pattern.h:360
int parts(std::vector< PatternPointer > &container) const
Definition: pattern.cpp:1337
size_t size()
Definition: patternstore.h:261
Class for encoding plain-text to binary class-encoded data.
void computenpmi(std::map< PatternPointer, t_relationmap_double > &coocmap, double threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3141
const size_t n() const
Definition: pattern.cpp:89
void info(std::ostream *OUT)
Definition: patternmodel.h:2340
int pruneskipgrams(int threshold, int minskiptypes, int _n=0)
Definition: patternmodel.h:2845
unsigned int coveragecount(const Pattern &key)
Definition: patternmodel.h:1389
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of ...
Definition: patternmodel.h:2192
unsigned int topthreshold(int amount, int category=0, int size=0)
Definition: patternmodel.h:1967
uint32_t reversemask(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:58
void histogram(std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1988
virtual unsigned int occurrencecount(const PatternPointer &pattern)
Definition: patternmodel.h:1330
std::vector< IndexReference >::const_iterator const_iterator
Definition: datatypes.h:110
A pattern store in the form of an unordered set (i.e, no duplicates). Stores only patterns...
Definition: patternstore.h:538
void print(std::ostream *out, ClassDecoder &decoder, const PatternPointer &pattern, bool endline=true)
Definition: patternmodel.h:2421
uint64_t totaltypes
Total number of unigram/word types in the original corpus, SO INCLUDING THOSE NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:531
Definition: common.h:35
unsigned int prunenotinset(const std::unordered_set< Pattern > &s, int _n)
Definition: patternmodel.h:1784
virtual unsigned int types()
Definition: patternmodel.h:498
std::map< int, std::map< int, unsigned int > > cache_grouptotalwordtypes
total covered word types per group
Definition: patternmodel.h:541
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2321
int masktailskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:77
PatternType
Definition: pattern.h:59
virtual int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:2177
PatternModelOptions(const PatternModelOptions &ref)
Definition: patternmodel.h:188
Definition: patternmodel.h:97
unsigned int prunebymodel(PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel)
Definition: patternmodel.h:1811
int MINLENGTH
The minimum length of patterns to be loaded/extracted (in words/tokens) (default: 1) ...
Definition: patternmodel.h:125
void write(std::ostream *out)
Definition: patternmodel.h:1279
virtual unsigned int types()
Definition: patternmodel.h:1368
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3408
int maxn
Definition: patternmodel.h:305
Pattern pattern() const
Definition: pattern.h:527
virtual size_t size() const
Definition: patternmodel.h:662
int minn
Definition: patternmodel.h:306
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:473
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:700
unsigned char getdataversion(std::istream *in)
Definition: classdecoder.cpp:257
void insert(const Pattern &pattern)
Definition: patternstore.h:580
Measurement begin(const string &title)
Definition: benchmarks.cpp:148
iterator begin()
Definition: datatypes.h:112
int parts(std::vector< Pattern > &container) const
Definition: pattern.cpp:1225
PatternModelOptions()
Definition: patternmodel.h:157
virtual void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1854
virtual unsigned int tokens() const
Definition: patternmodel.h:1376
unsigned char model_version
Definition: patternmodel.h:302
uint32_t sentence
Definition: datatypes.h:35
std::map< int, std::map< int, unsigned int > > cache_grouptotalpatterns
total distinct patterns per group
Definition: patternmodel.h:540
bool has(const Pattern &pattern) const
Definition: patternstore.h:587
virtual PatternStoreInterface * getstoreinterface()
Definition: patternmodel.h:288
std::vector< std::pair< Pattern, int > > getpatterns(const Pattern &pattern)
Definition: patternmodel.h:1834
PatternMap< double, BaseValueHandler< double >, uint64_t >::iterator t_relationmap_double_iterator
Definition: patternmodel.h:233
virtual t_relationmap getrightneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2175
virtual void trainskipgrams(PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:2453
bool hasskipgrams
Does this model have skipgrams?
Definition: patternmodel.h:565
Definition: patternmodel.h:87
int flexgramsize(const Pattern &pattern, IndexReference begin)
Definition: patternmodel.h:3300
std::map< int, std::map< int, unsigned int > > cache_grouptotaltokens
total covered tokens per group
Definition: patternmodel.h:542
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:283
virtual void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2288
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:547
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:3428
virtual size_t size() const
Definition: patternmodel.h:361
vector< uint32_t > compute_skip_configurations(const int n, const int maxskips)
Definition: algorithms.cpp:85
unsigned int totaloccurrencesingroup(int category, int n)
Definition: patternmodel.h:1623
int computeflexgrams_fromcooc(double threshold)
Definition: patternmodel.h:3217
virtual void add(const Pattern &pattern, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2282
Definition: pattern.h:55
std::string tostring() const
Definition: datatypes.h:72
PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:325
virtual void outputrelations(const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT)
Definition: patternmodel.h:2168
t_relationmap getrightcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, IndexedData *matches=NULL)
Definition: patternmodel.h:2939
virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n=0)
Definition: patternmodel.h:1758