capi/html/patternmodel_8h_source.html

 #ifndef PATTERNMODEL_H

 #define PATTERNMODEL_H


 /*****************************

 * Colibri Core

 *   by Maarten van Gompel

 *   Centre for Language Studies

 *   Radboud University Nijmegen

 *

 *   http://proycon.github.io/colibri-core

 *

 *   Licensed under GPLv3

 *****************************/


 #include "patternstore.h"

 #include "classencoder.h"

 #include "algorithms.h"

 #include <limits>

 #include <cmath>

 #include <cstdint>

 #include <map>

 #include <set>

 #include <sstream>

 #include <array>

 #include <exception>

 #include "bz2stream.h"


 /**
  * Numeric tags identifying the kind of model. The load() routines in this
  * header read one such tag from the second byte of a model file and compare
  * it against these values.
  */
 enum ModelType {
     UNINDEXEDPATTERNMODEL        = 10, ///< value reported by PatternModel::getmodeltype()
     UNINDEXEDPATTERNPOINTERMODEL = 11, ///< unindexed model; PatternModel::load() first reads embedded corpus data
     INDEXEDPATTERNMODEL          = 20, ///< model whose values are read as IndexedData
     INDEXEDPATTERNPOINTERMODEL   = 21, ///< indexed model; PatternModel::load() first reads embedded corpus data
     PATTERNSETMODEL              = 30, ///< value reported by PatternSetModel::getmodeltype()
     PATTERNALIGNMENTMODEL        = 40, ///< model whose values are read as PatternFeatureVectorMap<double>
 };


 /**
  * Selector for a reverse-index flavour. The enumerators are numbered
  * consecutively from zero (NONE=0, QUICK=1, COMPACT=2).
  */
 enum ReverseIndexType {
     NONE = 0, ///< 0
     QUICK,    ///< 1
     COMPACT   ///< 2
 };


 // Free function taking the name of a file and returning an int; only declared
 // here, the definition lives elsewhere.
 // NOTE(review): presumably returns the ModelType tag stored in the file -- confirm against the definition.
 int getmodeltype(const std::string & filename);


 /**
  * Exception type whose message states that a pattern was not found in a model.
  */
 class NoSuchPattern: public std::exception {
   public:
     /**
      * @return a fixed, static description of the error.
      *
      * Declared public (the class default would make it private, so it could
      * only be reached through a std::exception reference) and noexcept to
      * match std::exception::what().
      */
     virtual const char* what() const noexcept override
     {
       return "Pattern not found in model";
     }
 };


 /**
  * Plain bundle of settings consumed by the model loading and training code.
  * All members are public and can be changed freely after construction.
  */
 class PatternModelOptions {
     public:
         int MINTOKENS;              ///< occurrence threshold; -1 lets the consumer pick (train() turns -1 into 2 and 0 into 1)
         int MINTOKENS_SKIPGRAMS;    ///< threshold for skipgrams; train() raises it to MINTOKENS when it is lower
         int MINTOKENS_UNIGRAMS;     ///< secondary word occurrence threshold; the default (1) effectively disables it
         int MINLENGTH;              ///< minimum pattern length handed to the readers
         int MAXLENGTH;              ///< maximum pattern length handed to the readers; upper bound of the n loop in train()
         int MAXBACKOFFLENGTH;

         bool DOSKIPGRAMS;
         bool DOSKIPGRAMS_EXHAUSTIVE; ///< when set, train() computes skip configurations per n
         int MINSKIPTYPES;
         int MAXSKIPS;               ///< passed to compute_skip_configurations() by train()

         bool DOREVERSEINDEX;        ///< obsolete
         bool DOPATTERNPERLINE;      ///< treat each input line as one pattern ("one per line")

         int PRUNENONSUBSUMED;       ///< Prune all n-grams that are not subsumed by higher-order ngrams

         bool DOREMOVEINDEX;         ///< only for indexed models
         bool DOREMOVENGRAMS;        ///< negated and passed to the readers
         bool DOREMOVESKIPGRAMS;     ///< negated and passed to the readers
         bool DOREMOVEFLEXGRAMS;     ///< negated and passed to the readers
         bool DORESET;               ///< passed to MapType::read()

         bool QUIET;                 ///< suppresses the progress messages written to std::cerr
         bool DEBUG;                 ///< enables extra diagnostic output on std::cerr

         /** Builds the option set with its documented defaults. */
         PatternModelOptions():
             MINTOKENS(-1),            //defaults to 2 for building, 1 for loading
             MINTOKENS_SKIPGRAMS(-1),  //defaults to MINTOKENS
             MINTOKENS_UNIGRAMS(1),    //effectively disabled by default
             MINLENGTH(1),
             MAXLENGTH(100),
             MAXBACKOFFLENGTH(100),
             DOSKIPGRAMS(false),
             DOSKIPGRAMS_EXHAUSTIVE(false),
             MINSKIPTYPES(2),
             MAXSKIPS(3),
             DOREVERSEINDEX(true),     //obsolete
             DOPATTERNPERLINE(false),
             PRUNENONSUBSUMED(0),
             DOREMOVEINDEX(false),     //only for indexed models
             DOREMOVENGRAMS(false),
             DOREMOVESKIPGRAMS(false),
             DOREMOVEFLEXGRAMS(false),
             DORESET(false),
             QUIET(false),
             DEBUG(false)
         {}

         /** Member-by-member copy of another option set. */
         PatternModelOptions(const PatternModelOptions & ref):
             MINTOKENS(ref.MINTOKENS),
             MINTOKENS_SKIPGRAMS(ref.MINTOKENS_SKIPGRAMS),
             MINTOKENS_UNIGRAMS(ref.MINTOKENS_UNIGRAMS),
             MINLENGTH(ref.MINLENGTH),
             MAXLENGTH(ref.MAXLENGTH),
             MAXBACKOFFLENGTH(ref.MAXBACKOFFLENGTH),
             DOSKIPGRAMS(ref.DOSKIPGRAMS),
             DOSKIPGRAMS_EXHAUSTIVE(ref.DOSKIPGRAMS_EXHAUSTIVE),
             MINSKIPTYPES(ref.MINSKIPTYPES),
             MAXSKIPS(ref.MAXSKIPS),
             DOREVERSEINDEX(ref.DOREVERSEINDEX),
             DOPATTERNPERLINE(ref.DOPATTERNPERLINE),
             PRUNENONSUBSUMED(ref.PRUNENONSUBSUMED),
             DOREMOVEINDEX(ref.DOREMOVEINDEX),
             DOREMOVENGRAMS(ref.DOREMOVENGRAMS),
             DOREMOVESKIPGRAMS(ref.DOREMOVESKIPGRAMS),
             DOREMOVEFLEXGRAMS(ref.DOREMOVEFLEXGRAMS),
             DORESET(ref.DORESET),
             QUIET(ref.QUIET),
             DEBUG(ref.DEBUG)
         {}
 };


 // Pattern maps with 64-bit size fields, holding integer resp. floating-point values.
 using t_relationmap = PatternMap<uint32_t,BaseValueHandler<uint32_t>,uint64_t>;
 using t_relationmap_double = PatternMap<double,BaseValueHandler<double>,uint64_t>;

 // Named iterator types for the two maps above (needed for Cython).
 using t_relationmap_iterator = PatternMap<uint32_t,BaseValueHandler<uint32_t>,uint64_t>::iterator;
 using t_relationmap_double_iterator = PatternMap<double,BaseValueHandler<double>,uint64_t>::iterator;


 /**
  * Abstract interface shared by the pattern model classes in this header.
  * It extends PatternStoreInterface with model metadata and counts.
  */
 class PatternModelInterface: public PatternStoreInterface {
     public:
         /** Virtual destructor so that a model can be destroyed safely through this interface. */
         virtual ~PatternModelInterface() {}

         /** @return the type tag of the implementing class (the implementations in this header return ModelType values) */
         virtual int getmodeltype() const=0;

         /** @return the model format version of the implementing class */
         virtual int getmodelversion() const=0;

         //these are already in PatternStoreInterface:
             //virtual bool has(const Pattern &) const =0;
             //virtual bool has(const PatternPointer &) const =0;
             //virtual size_t size() const =0;

         /** @return the occurrence count the model holds for the given pattern */
         virtual unsigned int occurrencecount(const Pattern & pattern)=0;

         /** @return the frequency the model reports for the given pattern */
         virtual double frequency(const Pattern &) =0;

         virtual int maxlength() const=0;
         virtual int minlength() const=0;

         virtual unsigned int types() =0;

         virtual unsigned int tokens() const=0;

         /** @return this object viewed through its PatternStoreInterface base */
         virtual PatternStoreInterface * getstoreinterface() {
             return static_cast<PatternStoreInterface*>(this);
         }
 };


 /**
  * Pattern model backed by a PatternSet<uint64_t>: it keeps patterns only, so
  * occurrencecount() and frequency() always return 0. It can be loaded from
  * set, unindexed, indexed and alignment model files (see load()).
  */
 class PatternSetModel: public PatternSet<uint64_t>, public PatternModelInterface {
     protected:
         unsigned char model_type;     //type tag; overwritten by load() with the value found in the file
         unsigned char model_version;  //format version; overwritten by load() with the value found in the file
         uint64_t totaltokens; //INCLUDES TOKENS NOT COVERED BY THE MODEL!
         uint64_t totaltypes; //TOTAL UNIGRAM TYPES, INCLUDING NOT COVERED BY THE MODEL!
         int maxn;
         int minn;
     public:
         /** Creates an empty set model. */
         PatternSetModel() {
             totaltokens = 0;
             totaltypes = 0;
             maxn = 0;
             minn = 999;
             model_type = this->getmodeltype();
             model_version = this->getmodelversion();
         }

         /**
          * Creates a set model and loads it from an already opened stream.
          * @param f stream positioned at the start of a model file
          * @param options loading options (length limits, category filters, DEBUG)
          * @param constrainmodel optional model whose store interface is handed to the readers as a constraint
          */
         PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) {
             totaltokens = 0;
             totaltypes = 0;
             maxn = 0;
             minn = 999;
             model_type = this->getmodeltype();
             model_version = this->getmodelversion();
             this->load(f,options, constrainmodel);
         }

         /**
          * Creates a set model and loads it from the named file.
          * @throws InternalError when the file cannot be opened or is not a recognised model file
          */
         PatternSetModel(const std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
             totaltokens = 0;
             totaltypes = 0;
             maxn = 0;
             minn = 999;
             model_type = this->getmodeltype();
             model_version = this->getmodelversion();
             if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
             //automatic storage: the stream is closed and released on every exit path, including exceptions
             std::ifstream in(filename.c_str());
             if (!in.good()) {
                 std::cerr << "ERROR: Unable to load file " << filename << std::endl;
                 throw InternalError();
             }
             this->load( static_cast<std::istream *>(&in), options, constrainmodel);
         }

         virtual int getmodeltype() const { return PATTERNSETMODEL; }
         virtual int getmodelversion() const { return 2; }

         virtual size_t size() const {
             return PatternSet<uint64_t>::size();
         }
         virtual bool has(const Pattern & pattern) const {
             return PatternSet<uint64_t>::has(pattern);
         }
         virtual bool has(const PatternPointer & pattern) const {
             return PatternSet<uint64_t>::has(pattern);
         }

         /**
          * Opens the named file and loads the model from it.
          * @throws InternalError when the file cannot be opened or is not a recognised model file
          */
         virtual void load(std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
             if (!options.QUIET) std::cerr << "Loading " << filename << " as set-model" << std::endl;
             //automatic storage: the stream is closed and released on every exit path, including exceptions
             std::ifstream in(filename.c_str());
             if (!in.good()) {
                 std::cerr << "ERROR: Unable to load file " << filename << std::endl;
                 throw InternalError();
             }
             this->load( static_cast<std::istream *>(&in), options, constrainmodel);
         }

         /**
          * Loads the model from a stream. The header consists of a zero byte,
          * the model type, the model version, and two 64-bit totals (tokens,
          * types); the pattern data follows and is read according to the type.
          * @throws InternalError when the header cannot be read or does not describe a supported model type
          */
         virtual void load(std::istream * f, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) { //load from file
             char null = 1; //non-zero, so a failed read can never pass the header check below
             f->read( &null, sizeof(char));
             f->read( reinterpret_cast<char*>(&model_type), sizeof(char));
             f->read( reinterpret_cast<char*>(&model_version), sizeof(char));
             if (model_version == 1) this->classencodingversion = 1;
             if (f->fail() || (null != 0) || ((model_type != UNINDEXEDPATTERNMODEL) && (model_type != INDEXEDPATTERNMODEL) && (model_type != PATTERNSETMODEL) && (model_type != PATTERNALIGNMENTMODEL) ))  {
                 std::cerr << "ERROR: File is not a colibri patternmodel file" << std::endl;
                 throw InternalError();
             }
             if (model_version > 2) {
                 std::cerr << "WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
             }
             f->read( reinterpret_cast<char*>(&totaltokens), sizeof(uint64_t));
             f->read( reinterpret_cast<char*>(&totaltypes), sizeof(uint64_t));

             PatternStoreInterface * constrainstore = NULL;
             if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();

             if (options.DEBUG) {
                 std::cerr << "Debug enabled, loading PatternModel type " << (int) model_type << ", version " << (int) model_version << ", classencodingversion=" << (int) this->classencodingversion << std::endl;
                 std::cerr << "Total tokens: " << totaltokens << ", total types: " << totaltypes << std::endl;
             }
             if (model_type == PATTERNSETMODEL) {
                 //reading set
                 PatternSet<uint64_t>::read(f, options.MINLENGTH, options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS); //read PatternStore
             } else if (model_type == INDEXEDPATTERNMODEL) {
                 //reading from indexed pattern model, ok:
                  readmap<IndexedData,IndexedDataHandler>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore,  !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
             } else if (model_type == UNINDEXEDPATTERNMODEL)  {
                 //reading from unindexed pattern model, ok:
                  readmap<uint32_t,BaseValueHandler<uint32_t>>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
             } else if (model_type == PATTERNALIGNMENTMODEL)  {
                  //ok:
                  readmap<PatternFeatureVectorMap<double>, PatternFeatureVectorMapHandler<double>>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore,  !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS);
             } else {
                 std::cerr << "ERROR: Unknown model type" << std::endl;
                 throw InternalError();
             }
         }

         /**
          * Writes the model to a stream: zero byte, type, version, total
          * tokens, total types, then the pattern set itself.
          */
         void write(std::ostream * out) {
             const char null = 0;
             out->write( &null, sizeof(char));
             const unsigned char t = this->getmodeltype();
             out->write( reinterpret_cast<const char*>(&t), sizeof(char));
             const unsigned char v = this->getmodelversion();
             out->write( reinterpret_cast<const char*>(&v), sizeof(char));
             out->write( reinterpret_cast<const char*>(&totaltokens), sizeof(uint64_t));
             const uint64_t tp = this->types(); //use this instead of totaltypes, as it may need to be computed on-the-fly still
             out->write( reinterpret_cast<const char*>(&tp), sizeof(uint64_t));
             PatternSet<uint64_t>::write(out); //write
         }

         /** Writes the model to the named file. */
         void write(const std::string & filename) {
             //automatic storage: the stream is flushed, closed and released even if write() throws
             std::ofstream out(filename.c_str());
             this->write(&out);
         }

         /** @return this object viewed through its PatternModelInterface base */
         PatternModelInterface * getinterface() {
             return static_cast<PatternModelInterface*>(this);
         }

         /** A set model holds no counts: always 0. */
         virtual unsigned int occurrencecount(const Pattern & pattern) { return 0;  }

         /** A set model holds no counts: always 0. */
         virtual double frequency(const Pattern &) { return 0; }

         typedef typename PatternSet<uint64_t>::iterator iterator;
         typedef typename PatternSet<uint64_t>::const_iterator const_iterator;

         virtual int maxlength() const { return maxn; }

         virtual int minlength() const { return minn; }

         virtual unsigned int types()  {
             return totaltypes;
         }
         virtual unsigned int tokens() const { return totaltokens; }

         /** @return the stored model type (after load(): the type found in the file) */
         unsigned char type() const { return model_type; }
         /** @return the stored model version (after load(): the version found in the file) */
         unsigned char version() const { return model_version; }
 };


 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class MapType = PatternMap<ValueType, BaseValueHandler<ValueType>>, class PatternType = Pattern>

 class PatternModel: public MapType, public PatternModelInterface {

     protected:

         //Type tag of the model; set from getmodeltype() in the constructors and
         //overwritten by load() with the value found in the file header.
         unsigned char model_type;

         //Format version; set from getmodelversion() in the constructors and
         //overwritten by load() with the value found in the file header.
         unsigned char model_version;

         //Total token count; read from the file by load(), accumulated per line by train().
         uint64_t totaltokens;

         //Total type count; read from the file by load().
         uint64_t totaltypes;


         //Largest pattern length seen; updated by postread().
         int maxn;

         //Smallest pattern length seen (starts at 999); updated by postread().
         int minn;


         //std::multimap<IndexReference,Pattern> reverseindex;

         std::set<int> cache_categories;

         std::set<int> cache_n;

         //train() indexes this as cache_grouptotal[category][n] (e.g. [NGRAM][n], [SKIPGRAM][n]).
         std::map<int,std::map<int,unsigned int>> cache_grouptotal;

         //NOTE(review): the three caches below presumably share the [category][n] layout -- confirm.
         std::map<int,std::map<int,unsigned int>> cache_grouptotalpatterns ;

         std::map<int,std::map<int,unsigned int>> cache_grouptotalwordtypes;

         std::map<int,std::map<int,unsigned int>> cache_grouptotaltokens;


         //Keyed by n; train() fills an entry from compute_skip_configurations(n, MAXSKIPS)
         //when DOSKIPGRAMS_EXHAUSTIVE is set and the entry is still empty.
         std::map<int, std::vector< uint32_t > > gapmasks;


         /**
          * Hook invoked at the end of load(). Walks all stored patterns once to
          * update maxn / minn and to set hasskipgrams as soon as a skipgram is
          * encountered.
          *
          * This function has a specialisation specific to indexed pattern
          * models; this is the generic version.
          *
          * @param options loading options (not used by the generic version)
          */
         virtual void postread(const PatternModelOptions options) {
             for (iterator iter = this->begin(); iter != this->end(); ++iter) {
                 //bind by reference rather than copying each stored pattern
                 const PatternType & p = iter->first;
                 const int n = p.n();
                 if (n > maxn) maxn = n;
                 if (n < minn) minn = n;
                 if ((!hasskipgrams) && (p.isskipgram())) hasskipgrams = true;
             }
         }

         //Hook with an empty body in this generic class; it is virtual so that
         //subclasses can supply their own post-training step.
         virtual void posttrain(const PatternModelOptions options) {

             //nothing to do here, the indexed model specialises this function to

             //sort indices

         }

     public:

         //Corpus attached to the model, or NULL. Either supplied by the caller
         //through a constructor, or allocated by load() for pointer models.
         IndexedCorpus * reverseindex;

         //True when reverseindex was allocated by load(); the destructor then deletes it.
         bool reverseindex_internal;

         //Set to true by postread() once a stored pattern reports isskipgram().
         bool hasskipgrams;


         PatternModel<ValueType,ValueHandler,MapType,PatternType>(IndexedCorpus * corpus = NULL) {

             totaltokens = 0;

             totaltypes = 0;

             maxn = 0;

             minn = 999;

             hasskipgrams = false;

             model_type = this->getmodeltype();

             model_version = this->getmodelversion();

             if (corpus) {

                 this->reverseindex = corpus;

                 this->attachcorpus(*corpus);

             } else {

                 this->reverseindex = NULL;

             }

             reverseindex_internal = false;

         }


         PatternModel<ValueType,ValueHandler,MapType,PatternType>(std::istream *f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL) {

             totaltokens = 0;

             totaltypes = 0;

             maxn = 0;

             minn = 999;

             hasskipgrams = false;

             model_type = this->getmodeltype();

             model_version = this->getmodelversion();

             this->load(f,options,constrainmodel);

             if (corpus) {

                 this->reverseindex = corpus;

                 this->attachcorpus(*corpus);

             } else {

                 this->reverseindex = NULL;

             }

             reverseindex_internal = false;

         }


         ~PatternModel<ValueType,ValueHandler,MapType,PatternType>() {

             if (reverseindex_internal && reverseindex != NULL) delete reverseindex;

         }

         PatternModel<ValueType,ValueHandler,MapType,PatternType>(const std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL) { //load from file

             //IndexedPatternModel will overload this

             totaltokens = 0;

             totaltypes = 0;

             maxn = 0;

             minn = 999;

             hasskipgrams = false;

             model_type = this->getmodeltype();

             model_version = this->getmodelversion();

             if (corpus) {

                 this->reverseindex = corpus;

                 this->attachcorpus(*corpus);

             } else {

                 this->reverseindex = NULL;

             }

             reverseindex_internal = false;

             if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;

             std::ifstream * in = new std::ifstream(filename.c_str());

             if (!in->good()) {

                 std::cerr << "ERROR: Unable to load file " << filename << std::endl;

                 throw InternalError();

             }

             this->load( (std::istream *) in, options, constrainmodel);

             in->close();

             delete in;

         }


         /** Type tag of this class: the generic pattern model reports itself as unindexed. */
         virtual int getmodeltype() const {
             return UNINDEXEDPATTERNMODEL;
         }

         /** Model format version of this class; load() warns when a file carries a higher version. */
         virtual int getmodelversion() const {
             return 2;
         }


         /** Number of entries in the underlying map; forwards to MapType::size(). */
         virtual size_t size() const {
             const size_t count = MapType::size();
             return count;
         }


         /** Membership test for a Pattern; forwards to MapType::has(). */
         virtual bool has(const Pattern & pattern) const {
             const bool present = MapType::has(pattern);
             return present;
         }

         /** Membership test for a PatternPointer; forwards to MapType::has(). */
         virtual bool has(const PatternPointer & pattern) const {
             const bool present = MapType::has(pattern);
             return present;
         }


         /**
          * Opens the named file and loads the model from it via the stream overload.
          *
          * @param filename path of the model file
          * @param options loading options; QUIET suppresses the "Loading" message
          * @param constrainmodel optional model used as a constraint while reading
          * @throws InternalError when the file cannot be opened or the stream overload rejects it
          */
         virtual void load(std::string & filename, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) {
             if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
             //automatic storage: the stream is closed and released on every exit path, including exceptions
             std::ifstream in(filename.c_str());
             if (!in.good()) {
                 std::cerr << "ERROR: Unable to load file " << filename << std::endl;
                 throw InternalError();
             }
             this->load( static_cast<std::istream *>(&in), options, constrainmodel);
         }


         /**
          * Loads the model from a stream.
          *
          * Layout read here: a zero byte, the model type, the model version;
          * for pointer models an unsigned int corpus size followed by that many
          * bytes of corpus data; then two 64-bit totals (tokens, types); then
          * the pattern data, read by MapType::read() with value types chosen
          * from the file's model type and this class's own type. postread()
          * is called at the end.
          *
          * @param f stream positioned at the start of a model file
          * @param options loading options (thresholds, length limits, category filters, DORESET, DEBUG)
          * @param constrainmodel optional model whose store interface is handed to the reader as a constraint
          * @throws InternalError when the header or the corpus size cannot be read,
          *         or the header does not describe a supported model type
          */
         virtual void load(std::istream * f, const PatternModelOptions & options, PatternModelInterface * constrainmodel = NULL) { //load from file
             char null = 1; //non-zero, so a failed read can never pass the header check below
             f->read( (char*) &null, sizeof(char));
             f->read( (char*) &model_type, sizeof(char));
             f->read( (char*) &model_version, sizeof(char));
             if (model_version == 1) this->classencodingversion = 1;
             if (f->fail() || (null != 0) || ((model_type != UNINDEXEDPATTERNMODEL) && (model_type != UNINDEXEDPATTERNPOINTERMODEL) && (model_type != INDEXEDPATTERNMODEL) && (model_type != INDEXEDPATTERNPOINTERMODEL) && (model_type != PATTERNALIGNMENTMODEL) ))  {
                 std::cerr << "File is not a colibri model file (or a very old one)" << std::endl;
                 throw InternalError();
             }
             if (model_version > 2) {
                 std::cerr << "WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
             }
             if (options.DEBUG) {
                 std::cerr << "Debug enabled, loading PatternModel type " << (int) model_type << ", version " << (int) model_version << ", classencodingversion=" << (int) this->classencodingversion << std::endl;
             }
             if ((model_type == UNINDEXEDPATTERNPOINTERMODEL) || (model_type == INDEXEDPATTERNPOINTERMODEL)) {
                 //pointer models carry their corpus data inside the model file
                 this->patterntype = PATTERNPOINTER;
                 if (options.DEBUG) std::cerr << "Reading corpus data" << std::endl;
                 unsigned int corpussize = 0;
                 f->read( (char*) &corpussize, sizeof(unsigned int));
                 if (f->fail()) {
                     //never size the allocation below from a value that was not actually read
                     std::cerr << "ERROR: Unable to read corpus size from model file" << std::endl;
                     throw InternalError();
                 }
                 unsigned char * corpusdata = new unsigned char[corpussize];
                 f->read((char*) corpusdata,sizeof(unsigned char) * corpussize);
                 reverseindex = new IndexedCorpus(corpusdata, corpussize);
                 this->attachcorpus(*reverseindex);
                 reverseindex_internal = true; //the destructor deletes the corpus allocated here
                 if (options.DEBUG) std::cerr << "(read " << corpussize << " bytes)" << std::endl;
             }
             f->read( (char*) &totaltokens, sizeof(uint64_t));
             f->read( (char*) &totaltypes, sizeof(uint64_t));

             PatternStoreInterface * constrainstore = NULL;
             if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();

             if (options.DEBUG) {
                 std::cerr << "Total tokens: " << totaltokens << ", total types: " << totaltypes << std::endl;
             }

             const int owntype = this->getmodeltype(); //type of the class doing the loading, as opposed to the type stored in the file

             if (((model_type == INDEXEDPATTERNMODEL) && (owntype == UNINDEXEDPATTERNMODEL)) || ((model_type == INDEXEDPATTERNPOINTERMODEL) && (owntype == UNINDEXEDPATTERNPOINTERMODEL)))  {
                 //reading indexed pattern model as unindexed, (or indexed patternPOINTERmodels as unindexed patternPOINTERmodels)
                  MapType::template read<IndexedData,IndexedDataHandler,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore,  !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET,   options.DEBUG);
             } else if ((model_type == UNINDEXEDPATTERNMODEL) && (owntype == INDEXEDPATTERNMODEL)) {
                //reading unindexed model as indexed, this will load the patterns but lose all the counts
                  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET,   options.DEBUG);
             } else if ((model_type == UNINDEXEDPATTERNPOINTERMODEL) && (owntype == UNINDEXEDPATTERNMODEL)) {
                  //reading unindexed pointermodel as unindexed patternmodel
                  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,PatternPointer>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET,   options.DEBUG);
             } else if ((model_type == INDEXEDPATTERNPOINTERMODEL) && ((owntype == INDEXEDPATTERNMODEL) || (owntype == UNINDEXEDPATTERNMODEL))) {
                  //reading indexed patternpointermodel as (un)indexed patternmodel
                  MapType::template read<IndexedData,IndexedDataHandler,PatternPointer>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore,  !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET,   options.DEBUG);
             } else if (model_type == PATTERNALIGNMENTMODEL)  {
                  //reading pattern alignment model as pattern model, can be
                  //done, but semantics change:  count corresponds to the number of distinct alignments (for unindexed models)
                  //indexed models will lose all counts
                 MapType::template read<PatternFeatureVectorMap<double>,PatternFeatureVectorMapHandler<double>,PatternType>(f, options.MINTOKENS, options.MINLENGTH,options.MAXLENGTH, constrainstore,  !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS,options.DORESET,  options.DEBUG);
             } else {
                  MapType::template read(f, options.MINTOKENS,options.MINLENGTH, options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET,  options.DEBUG); //read PatternStore (also works for reading unindexed pattern models as indexed, which will load patterns but lose the counts)
             }
             this->postread(options);
         }


         /** @return this object viewed through its PatternModelInterface base */
         PatternModelInterface * getinterface() {
             PatternModelInterface * const self = (PatternModelInterface*) this;
             return self;
         }


         virtual void train(std::istream * in , PatternModelOptions options,  PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {

             if (options.MINTOKENS == -1) options.MINTOKENS = 2;

             if (options.MINTOKENS == 0)  options.MINTOKENS = 1;

             if (options.MINTOKENS_SKIPGRAMS < options.MINTOKENS) options.MINTOKENS_SKIPGRAMS = options.MINTOKENS;

             if (constrainbymodel == this) {

                 totaltypes = 0;

                 totaltokens = 0;

             } else if (constrainbymodel != NULL) {

                 totaltypes = constrainbymodel->types();

                 totaltokens = constrainbymodel->tokens();

             }

             uint32_t sentence = firstsentence-1;

             const unsigned char version = (in != NULL) ? getdataversion(in) : 2;


             bool iter_unigramsonly = false; //only needed for counting unigrams when we need them but they would be discarded

             bool skipunigrams = false; //will be set to true later only when MINTOKENS=1,MINLENGTH=1 to prevent double counting of unigrams

             if (( (options.MINLENGTH > 1) ||(options.MINTOKENS == 1)) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS)) {

                 iter_unigramsonly = true;

             }


             if (!options.QUIET) {

                 std::cerr << "Training patternmodel";

                 if (constrainbymodel != NULL) std::cerr << ", constrained by another model";

                 std::cerr << ", occurrence threshold: " << options.MINTOKENS;

                 if (iter_unigramsonly) std::cerr << ", secondary word occurrence threshold: " << options.MINTOKENS_UNIGRAMS;

                 if (version < 2) std::cerr << ", class encoding version: " << (int) version;

                 std::cerr << std::endl;

             }

             std::vector<std::pair<PatternPointer,int> > ngrams;

             std::vector<PatternPointer> subngrams;

             bool found;

             IndexReference ref;

             int prevsize = this->size();

             if (constrainbymodel == this) prevsize = 0; //going over same model

             int backoffn = 0;

             Pattern * linepattern = NULL;


             if (!this->data.empty()) {

                 if ((continued) && (!options.QUIET)) std::cerr << "Continuing training on preloaded model, computing statistics..." << std::endl;

                 this->computestats();

             }


             for (int n = 1; n <= options.MAXLENGTH; n++) {

                 bool skipgramsonly = false; //only used when continued==true, prevent double counting of n-grams whilst allowing skipgrams to be counted later

                 if (continued) {

                     if ((options.MINTOKENS > 1) && (constrainbymodel == NULL)) {

                        if (cache_grouptotal[NGRAM][n] > 0) {

                            if ((options.DOSKIPGRAMS_EXHAUSTIVE) && (cache_grouptotal[SKIPGRAM][n] == 0) ) {

                                skipgramsonly= true;

                            } else {

                                 if (!options.QUIET) std::cerr << "Skipping " << n << "-grams, already in model" << std::endl;

                                continue;

                            }

                        }

                     }

                 }

                 int foundngrams = 0;

                 int foundskipgrams = 0;

                 if (in != NULL) {

                     in->clear();

                     if (version >= 2) {

                         in->seekg(2);

                     } else {

                         in->seekg(0);

                     }

                 }

                 if (!options.QUIET) {

                     if (iter_unigramsonly) {

                         std::cerr << "Counting unigrams using secondary word occurrence threshold (" << options.MINTOKENS_UNIGRAMS << ")" << std::endl;

                     } else if (options.DOPATTERNPERLINE) {

                         std::cerr << "Counting patterns from list, one per line" << std::endl;

                     } else if (constrainbymodel != NULL) {

                         std::cerr << "Counting n-grams that occur in constraint model" << std::endl;

                     } else if (options.MINTOKENS > 1) {

                         std::cerr << "Counting " << n << "-grams" << std::endl;

                         if (skipgramsonly) std::cerr << "(only counting skipgrams actually, n-grams already counted earlier)" << std::endl;

                     } else {

                         std::cerr << "Counting *all* n-grams (occurrence threshold=1)" << std::endl;

                     }

                 }


                 if ((options.DOSKIPGRAMS_EXHAUSTIVE) && (gapmasks[n].empty())) gapmasks[n] = compute_skip_configurations(n, options.MAXSKIPS);


                 sentence = firstsentence-1; //reset

                 bool singlepass = false;

                 const unsigned int sentences = (reverseindex != NULL) ? reverseindex->sentences() : 0;

                 while (((reverseindex != NULL) && (sentence < sentences)) ||  ((reverseindex == NULL) && (in != NULL) && (!in->eof())))  {

                     sentence++;

                     //read line

                     if (linepattern != NULL) delete linepattern;

                     if (reverseindex == NULL) linepattern = new Pattern(in,false,version);

                     PatternPointer line = (reverseindex != NULL) ? reverseindex->getsentence(sentence) : PatternPointer(linepattern);

                     //if (in->eof()) break;

                     const unsigned int linesize = line.n();

                     if (options.DEBUG) std::cerr << "Processing line " << sentence << ", size (tokens) " << linesize << " (bytes) " << line.bytesize() << ", n=" << n <<  std::endl;

                     if (linesize == 0) {

                         //skip empty lines

                         continue;

                     }

                     //count total tokens

                     if ((n==1) && (!continued)) totaltokens += linesize;


                     ngrams.clear();

                     if (options.DOPATTERNPERLINE) {

                         if (linesize > (unsigned int) options.MAXLENGTH) continue;

                         ngrams.push_back(std::pair<PatternPointer,int>(line,0));

                     } else {

                         if (iter_unigramsonly) {

                             line.ngrams(ngrams, n);

                         } else if ((options.MINTOKENS > 1) && (constrainbymodel == NULL)) {

                             line.ngrams(ngrams, n);

                         } else {

                             singlepass = true;

                             int minlength = options.MINLENGTH;

                             if (continued) minlength = this->maxn + 1;

                             line.subngrams(ngrams,minlength,options.MAXLENGTH); //extract ALL ngrams if MINTOKENS == 1 or a constraint model is set, no need to look back anyway, only one iteration over corpus

                         }

                     }

                     if (options.DEBUG) std::cerr << "\t" << ngrams.size() << " ngrams in line" << std::endl;


                     // *** ITERATION OVER ALL NGRAMS OF CURRENT ORDER (n) IN THE LINE/SENTENCE ***

                     for (std::vector<std::pair<PatternPointer,int>>::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {


                         try {

                             if ((singlepass) && (options.MINLENGTH == 1) && (skipunigrams) && (iter->first.n() == 1)) {

                                 //prevent double counting of unigrams after a iter_unigramsonly run with mintokens==1

                                 continue;

                             }


                             if (!skipgramsonly) {

                                 //check against constraint model

                                 if ((constrainbymodel != NULL) && (!iter_unigramsonly) && (!constrainbymodel->has(iter->first))) continue;


                                 found = true; //are the submatches in order? (default to true, attempt to falsify, needed for mintokens==1)


                                 //unigram check, special scenario, not usually processed!! (normal lookback suffices for most uses)

                                 if ((!iter_unigramsonly) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS) && ((n > 1) || (singlepass)) ) {

                                     subngrams.clear();

                                     iter->first.ngrams(subngrams,1); //get all unigrams

                                     for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {

                                         //check if unigram reaches threshold

                                         if (this->occurrencecount(*iter2) < (unsigned int) options.MINTOKENS_UNIGRAMS) {

                                             found = false;

                                             break;

                                         }

                                     }

                                 }


                                 //ngram (n-1) lookback

                                 if ((found) && (n > 1) && (options.MINTOKENS > 1) && (!options.DOPATTERNPERLINE) && (constrainbymodel == NULL)) {

                                     //check if sub-parts were counted

                                     subngrams.clear();

                                     backoffn = n - 1;

                                     if (backoffn > options.MAXBACKOFFLENGTH) backoffn = options.MAXBACKOFFLENGTH;

                                         iter->first.ngrams(subngrams, backoffn);

                                     for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {

                                         if (!this->has(*iter2)) {

                                             found = false;

                                             break;

                                         }

                                     }

                                 }


                                 ref = IndexReference(sentence, iter->second); //this is one token, we add the tokens as we find them, one by one

                                 if ((found) && (!skipgramsonly)) {

                                     if (options.DEBUG) std::cerr << "\t\tAdding @" << ref.sentence << ":" << ref.token << " n=" << iter->first.n() << " category=" <<(int) iter->first.category()<< std::endl;

                                     add(iter->first, ref);

                                 }

                             }

                             if (((n >= 3) || (options.MINTOKENS == 1)) //n is always 1 when mintokens == 1 !!

                                     && (options.DOSKIPGRAMS_EXHAUSTIVE)) {

                                 int foundskipgrams_thisround = this->computeskipgrams(iter->first, options, &ref, NULL, constrainbymodel, true );

                                 if (foundskipgrams_thisround > 0) hasskipgrams = true;

                                 foundskipgrams += foundskipgrams_thisround;

                             }

                         } catch (std::exception &e) {

                             std::cerr << "ERROR: An internal error has occured during training!!!" << std::endl;

                             if (ignoreerrors) continue;

                             throw InternalError();

                         }

                     }

                 }


                 if (!iter_unigramsonly) {

                     foundngrams = this->size() - foundskipgrams - prevsize;


                     if ((foundngrams) || (foundskipgrams)) {

                         if (n > this->maxn) this->maxn = n;

                         if (n < this->minn) this->minn = n;

                     } else {

                         if (!options.QUIET) std::cerr << "None found" << std::endl;

                         if (!continued) break;

                     }

                     if (!options.QUIET) std::cerr << " Found " << foundngrams << " ngrams...";

                     if (options.DOSKIPGRAMS_EXHAUSTIVE && !options.QUIET) std::cerr << foundskipgrams << " skipgram occurrences...";

                     if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == this))) {

                         if ((options.MINTOKENS > 1) && (n == 1)) {

                             totaltypes = this->size(); //total unigrams, also those not in model

                         } else if ((options.MINTOKENS == 1) && (options.MINLENGTH == 1)) {

                             if (!options.QUIET) std::cerr << " computing total word types prior to pruning...";

                             totaltypes = totalwordtypesingroup(NGRAM,1);

                             if (!options.QUIET) std::cerr << totaltypes << "...";

                         }

                     }

                     unsigned int pruned;

                     if (singlepass) {

                         pruned = this->prune(options.MINTOKENS,0); //prune regardless of size

                     } else {

                         pruned = this->prune(options.MINTOKENS,n); //prune only in size-class

                         if ( (!options.DOSKIPGRAMS) && (!options.DOSKIPGRAMS_EXHAUSTIVE) &&  ( n - 1 >= 1) &&  ( (n - 1) < options.MINLENGTH) && (n - 1 != options.MAXBACKOFFLENGTH) &&

                             !( (n-1 == 1) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS)  ) //don't delete unigrams if we're gonna need them

                             ) {

                             //we don't need n-1 anymore now we're done with n, it

                             //is below our threshold, prune it all (== -1)

                             this->prune(-1, n-1);

                             if (!options.QUIET) std::cerr << " (pruned last iteration due to minimum length)" << pruned;

                         }

                     }

                     if (!options.QUIET) std::cerr << "pruned " << pruned;

                     if (foundskipgrams) {

                         unsigned int prunedextra;

                         if ((options.MINTOKENS == 1) || (constrainbymodel != NULL)) {

                             prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, 0);

                         } else {

                             prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, n);

                         }

                         if (prunedextra && !options.QUIET) std::cerr << " plus " << prunedextra << " extra skipgrams..";

                         pruned += prunedextra;

                     }

                     if (!options.QUIET) std::cerr << "...total kept: " << (foundngrams + foundskipgrams) - pruned << std::endl;

                     if (((options.MINTOKENS == 1) || (constrainbymodel != NULL))) break; //no need for further n iterations, we did all in one pass since there's no point in looking back

                 } else { //iter_unigramsonly

                     if (!options.QUIET) std::cerr <<  "found " << this->size() << std::endl;


                     if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == this))) {

                         if (!options.QUIET) std::cerr << " computing total word types prior to pruning...";

                         totaltypes = this->size();

                         if (!options.QUIET) std::cerr << totaltypes << "...";

                     }

                     //prune the unigrams based on the word occurrence threshold

                     this->prune(options.MINTOKENS_UNIGRAMS,1);

                     //normal behaviour next round

                     iter_unigramsonly = false;

                     if ((n == 1) && (options.MINLENGTH ==1)) skipunigrams = true; //prevent double counting of unigrams

                     //decrease n so it will be the same (always 1) next (and by definition last) iteration

                     n--;

                 }

                 prevsize = this->size();

             }

             if (options.DOSKIPGRAMS && !options.DOSKIPGRAMS_EXHAUSTIVE) {

                 this->trainskipgrams(options, constrainbymodel);

             }

             if (options.MINTOKENS == 1) {

                 //needed to compute maxn, minn

                 this->postread(options);

             }

             if (options.MAXBACKOFFLENGTH < options.MINLENGTH) {

                 this->prune(-1, options.MAXBACKOFFLENGTH);

             }

             if ((options.MINLENGTH > 1) && (options.MINTOKENS_UNIGRAMS > options.MINTOKENS)) {

                 //prune the unigrams again

                 this->prune(-1,1);

             }

             if (options.PRUNENONSUBSUMED) {

                 if (!options.QUIET) std::cerr << "Pruning non-subsumed n-grams"  << std::endl;

                 int begin_n = options.PRUNENONSUBSUMED;

                 if ((begin_n > options.MAXLENGTH)) begin_n = options.MAXLENGTH;

                 for (int n = begin_n; n > 1; n--) {

                     std::unordered_set<Pattern> subsumed;

                     unsigned int prunednonsubsumed = 0;

                     PatternModel::iterator iter = this->begin();

                     while (iter != this->end()) {

                         const unsigned int pattern_n = iter->first.n();

                         if (pattern_n == (unsigned int) n) {

                             subngrams.clear();

                             iter->first.ngrams(subngrams, n-1);

                             for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(Pattern(*iter2));

                         }

                         iter++;

                     };

                     prunednonsubsumed += this->prunenotinset(subsumed, n-1);

                     if (!options.QUIET) std::cerr << " pruned " << prunednonsubsumed << " non-subsumed " << (n-1) << "-grams"  << std::endl;

                 }

             }

             this->posttrain(options);

             if (linepattern != NULL) delete linepattern;

         }


         /**
          * Train the model on a corpus file on disk.
          *
          * Opens the file and delegates to the stream-based train() overload.
          * A filename ending in ".bz2" is opened in binary mode and read through
          * a bz2istream decompressor; any other file is opened as a plain
          * std::ifstream.
          *
          * The streams live on the stack so they are closed and released even
          * when the delegated train() throws (it raises InternalError on
          * training errors unless ignoreerrors is set).
          *
          * @param filename         path of the corpus file
          * @param options          training options, forwarded unchanged
          * @param constrainbymodel forwarded unchanged to the stream overload
          * @param continued        forwarded unchanged to the stream overload
          * @param firstsentence    forwarded unchanged to the stream overload
          * @param ignoreerrors     forwarded unchanged to the stream overload
          */
         virtual void train(const std::string & filename, PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
             const bool bzipped = (filename.size() > 3) && (filename.compare(filename.size() - 3, 3, ".bz2") == 0);
             if (bzipped) {
                 std::ifstream in(filename.c_str(), std::ios::in|std::ios::binary);
                 //declared after the file stream so it is destroyed first (it reads from in's buffer)
                 bz2istream decompressor(in.rdbuf());
                 this->train(static_cast<std::istream*>(&decompressor), options, constrainbymodel, continued, firstsentence, ignoreerrors);
             } else {
                 std::ifstream in(filename.c_str());
                 this->train(static_cast<std::istream*>(&in), options, constrainbymodel, continued, firstsentence, ignoreerrors);
                 //the stream is closed by its destructor
             }
         }


         /**
          * Compute the skipgrams of a single pattern by applying every cached gap
          * mask for its length.
          *
          * If targetcontainer is NULL, valid skipgrams are added to the model
          * (using singleref, or otherwise every reference in multiplerefs); if it
          * is not NULL they are appended to targetcontainer instead and the model
          * is left untouched.
          *
          * @param pattern          the pattern to punch gaps into
          * @param mintokens        threshold; -1 is replaced by 2, and any value <= 1 by 1.
          *                         A value of 1 disables the sub-pattern pruning checks.
          * @param singleref        single occurrence to register for each skipgram
          * @param multiplerefs     occurrences to register when singleref is NULL
          * @param constrainbymodel when set, only skipgrams this model has are kept and
          *                         the sub-pattern pruning checks are skipped
          * @param targetcontainer  optional output container (may receive duplicates)
          * @param exhaustive       when true, additionally requires all parts of the
          *                         skipgram to be in the model (extra-check path only)
          * @param maxskips         passed to compute_skip_configurations() when the gap
          *                         masks for this pattern length are not cached yet
          * @param DEBUG            emit diagnostics on std::cerr and verify the length
          *                         of each generated skipgram
          * @return number of skipgrams not previously in the model (model mode), or the
          *         number of skipgrams appended to targetcontainer (container mode)
          *
          * An InternalError raised inside the loop is caught per gap configuration:
          * a message is printed and processing continues with the next mask.
          */
         virtual int computeskipgrams(const PatternPointer & pattern, int mintokens = 2,  const IndexReference * singleref= NULL, const IndexedData * multiplerefs = NULL,  PatternModelInterface * constrainbymodel = NULL, std::vector<PatternPointer> * targetcontainer = NULL,  const bool exhaustive = false, const int maxskips = 3, const bool DEBUG = false) {

             //if targetcontainer is NULL, skipgrams will be added to the model,
             // if not null , they will be added to the targetcontainer instead

             //normalise the threshold: -1 means "default" (2), anything <= 1 is treated as 1
             if (mintokens == -1) mintokens = 2;;
             if (mintokens  <= 1) {
                 mintokens = 1;
             }
             //internal function for computing skipgrams for a single pattern
             int foundskipgrams = 0;
             const int n = pattern.n();
             std::vector<PatternPointer> subngrams; //scratch buffer, reused for every gap configuration

             //gap masks are computed once per pattern length and then reused
             //NOTE(review): the cache is keyed on n only, so a later call with a different maxskips reuses the masks computed first -- confirm this is intended
             if (gapmasks[n].empty()) gapmasks[n] = compute_skip_configurations(n, maxskips);

             //loop over all possible gap configurations
             int gapconf_i = 0;
             for (std::vector<uint32_t>::iterator iter2 =  gapmasks[n].begin(); iter2 != gapmasks[n].end(); iter2++, gapconf_i++) {
                 if (*iter2 == 0) continue; //precaution (doesn't really happen anyway, but better safe than sorry)

                 //add skips
                 try {
                     //the skipgram is the same pattern with the gap mask applied
                     PatternPointer skipgram = pattern;
                     skipgram.mask = *iter2;

                     if (DEBUG) {
                         std::cerr << "Checking for: " << std::endl;
                         skipgram.out();
                     }

                     //with a constraint model, only skipgrams that model already has are considered
                     if ((constrainbymodel != NULL) && (!constrainbymodel->has(skipgram))) continue;

                     if (DEBUG) {
                         //sanity check: applying a mask must not change the pattern length
                         if ((int) skipgram.n() != n) {
                             std::cerr << "Generated invalid skipgram, n=" << skipgram.n() << ", expected " << n << std::endl;
                             throw InternalError();
                         }
                     }

                     bool skipgram_valid = true;
                     //pruning checks only apply with a threshold above 1 and without a constraint model
                     if ((mintokens != 1) && (constrainbymodel == NULL)) {
                         bool check_extra = false;
                         //check if sub-parts were counted
                         subngrams.clear();
                         skipgram.ngrams(subngrams,n-1); //this also works for and returns skipgrams, despite the name
                         //note: this inner iter2 shadows the gap-mask iterator of the enclosing loop
                         for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) { //only two patterns
                             const PatternPointer subpattern = *iter2;
                             if (!subpattern.isgap(0) && !subpattern.isgap(subpattern.n() - 1)) {
                                 //this subpattern is a valid
                                 //skipgram or ngram (no beginning or ending
                                 //gaps) that should occur
                                 if (DEBUG) {
                                     std::cerr << "Subpattern: " << std::endl;
                                     subpattern.out();
                                 }
                                 if (!this->has(subpattern)) {
                                     if (DEBUG) std::cerr << "  discarded" << std::endl;
                                     skipgram_valid = false;
                                     break;
                                 }
                             } else {
                                 //this check isn't enough, subpattern
                                 //starts or ends with gap
                                 //do additional checks
                                 check_extra = true;
                                 break;
                             }
                         }
                         if (!skipgram_valid) continue;

                         if (check_extra) {
                             if (exhaustive) { //the following is by definition the case in non-exhaustive mode, so we need only do it in exhaustive mode:

                                 //test whether parts occur in model, otherwise the skipgram
                                 //can't occur either and we can discard it
                                 std::vector<PatternPointer> parts;
                                 skipgram.parts(parts);
                                 for (std::vector<PatternPointer>::iterator iter3 = parts.begin(); iter3 != parts.end(); iter3++) {
                                     const PatternPointer part = *iter3;
                                     if (!this->has(part)) {
                                         skipgram_valid = false;
                                         break;
                                     }
                                 }
                                 if (!skipgram_valid) continue;
                             }

                             //check whether the gaps with single token context (X * Y) occur in model,
                             //otherwise skipgram can't occur
                             //each entry of gapconfiguration is used below as a (gap begin, gap length) pair
                             const std::vector<std::pair<int,int>> gapconfiguration = mask2vector(skipgram.mask, n);
                             for (std::vector<std::pair<int,int>>::const_iterator iter3 = gapconfiguration.begin(); iter3 != gapconfiguration.end(); iter3++) {
                                 if (!((iter3->first - 1 == 0) && (iter3->first + iter3->second + 1 == n))) { //entire skipgram is already X * Y format
                                     //slice out the gap plus one token of context on either side
                                     const PatternPointer subskipgram = PatternPointer(skipgram, iter3->first - 1, iter3->second + 2);
                                     if (DEBUG) {
                                         std::cerr << "Subskipgram: " << std::endl;
                                         subskipgram.out();
                                     }
                                     if (!this->has(subskipgram)) {
                                         if (DEBUG) std::cerr << "  discarded" << std::endl;
                                         skipgram_valid = false;
                                         break;
                                     }
                                 }
                             }
                         }
                     }

                     if (skipgram_valid) {
                         if (DEBUG) std::cerr << "  counted!" << std::endl;
                         if (targetcontainer == NULL) {
                             //put in model
                             //only skipgrams that are new to the model count towards the return value
                             if  (!has(skipgram)) foundskipgrams++;
                             if (singleref != NULL) {
                                 add(skipgram, *singleref ); //counts the actual skipgram, will add it to the model
                             } else if (multiplerefs != NULL) {
                                 for (IndexedData::const_iterator refiter =  multiplerefs->begin(); refiter != multiplerefs->end(); refiter++) {
                                     const IndexReference ref = *refiter;
                                     add(skipgram, ref ); //counts the actual skipgram, will add it to the model
                                 }
                             } else {
                                 //no reference to register; this exception is caught by the handler below,
                                 //and foundskipgrams may already have been incremented at this point
                                 std::cerr << "ERROR: computeskipgrams() called with no singleref and no multiplerefs" << std::endl;
                                 throw InternalError();
                             }
                         } else {
                             //put in target container, may contain duplicates
                             foundskipgrams++;
                             targetcontainer->push_back(skipgram);
                         }

                     }

                 } catch (InternalError &e) {
                     //errors are reported but not propagated; the next gap configuration is tried
                     std::cerr << "IGNORING ERROR and continuing with next skipgram" << std::endl;
                 }
             }
             return foundskipgrams;
         }


         /**
          * Backward-compatible overload that takes its settings from a
          * PatternModelOptions instance.
          *
          * Side effect: options.MINTOKENS_SKIPGRAMS is raised to options.MINTOKENS
          * when it is lower. The call is then forwarded to the main overload with
          * no target container, so results go into the model.
          *
          * @return whatever the main computeskipgrams() overload returns
          */
         virtual int computeskipgrams(const PatternPointer & pattern, PatternModelOptions & options ,  const IndexReference * singleref= NULL, const IndexedData * multiplerefs = NULL,  PatternModelInterface * constrainbymodel = NULL, const bool exhaustive = false) {
             //the skipgram threshold may never be below the general threshold
             if (options.MINTOKENS > options.MINTOKENS_SKIPGRAMS) {
                 options.MINTOKENS_SKIPGRAMS = options.MINTOKENS;
             }
             return computeskipgrams(
                 pattern,
                 options.MINTOKENS_SKIPGRAMS,
                 singleref,
                 multiplerefs,
                 constrainbymodel,
                 NULL, //no target container: add to the model
                 exhaustive,
                 options.MAXSKIPS,
                 options.DEBUG
             );
         }


         /**
          * Given a pattern, collect the skipgrams of it that occur in this model.
          *
          * Runs computeskipgrams() in non-exhaustive mode with this model as the
          * constraint model and a local vector as target container, so the model
          * itself is not modified by the call.
          *
          * @param pattern             the pattern to derive skipgrams from
          * @param occurrencethreshold passed on as the mintokens argument
          * @param maxskips            passed on as the maxskips argument
          * @return the collected skipgrams
          */
         virtual std::vector<PatternPointer> findskipgrams(const PatternPointer & pattern, unsigned int occurrencethreshold = 1, int maxskips = 3) {
             std::vector<PatternPointer> collected;
             const bool exhaustive = false;
             this->computeskipgrams(pattern, occurrencethreshold, NULL, NULL, this->getinterface(), &collected, exhaustive, maxskips);
             return collected;
         }


         /**
          * Skipgram training hook. This implementation does not support it:
          * it reports the problem on std::cerr and always throws InternalError.
          * Both parameters are unused here.
          */
         virtual void trainskipgrams(const PatternModelOptions options,  PatternModelInterface * constrainbymodel = NULL) {
             const char * const message = "Can not compute skipgrams on unindexed model (except exhaustively during train() )";
             std::cerr << message << std::endl;
             throw InternalError();
         }


         /**
          * Creates a new test model using the current model as training data,
          * i.e. only fragments that exist in the training model are counted;
          * the remaining fragments are 'uncovered'.
          *
          * Declaration only; the definition is not part of this section.
          * NOTE(review): presumably target receives the counts and in supplies
          * the test corpus -- confirm against the definition.
          */
         void test(MapType & target, std::istream * in);


         /**
          * Serialise the model to an output stream in binary form.
          *
          * Written in this order: a single null byte, the model type (one byte),
          * the model version (one byte); for the two pattern-pointer model types
          * the corpus size (unsigned int) followed by corpussize bytes of corpus
          * data; the total token count and the total type count (uint64_t each);
          * and finally the pattern store itself via MapType::write().
          *
          * @param out the stream to write to
          */
         void write(std::ostream * out) {
             const char leadingnull = 0;
             out->write(reinterpret_cast<const char*>(&leadingnull), sizeof(char));

             const unsigned char typebyte = this->getmodeltype();
             out->write(reinterpret_cast<const char*>(&typebyte), sizeof(char));

             const unsigned char versionbyte = this->getmodelversion();
             out->write(reinterpret_cast<const char*>(&versionbyte), sizeof(char));

             //pattern-pointer models carry their corpus data along
             const int modeltype = this->getmodeltype();
             const bool pointermodel = (modeltype == UNINDEXEDPATTERNPOINTERMODEL) || (modeltype == INDEXEDPATTERNPOINTERMODEL);
             if (pointermodel) {
                 out->write(reinterpret_cast<const char*>(&this->corpussize), sizeof(unsigned int));
                 out->write(reinterpret_cast<const char*>(this->corpusstart), sizeof(unsigned char) * this->corpussize);
             }

             out->write(reinterpret_cast<const char*>(&totaltokens), sizeof(uint64_t));

             //types() rather than totaltypes: the value may still have to be computed on-the-fly
             const uint64_t typecount = this->types();
             out->write(reinterpret_cast<const char*>(&typecount), sizeof(uint64_t));

             MapType::write(out); //write PatternStore
         }


         /**
          * Serialise the model to a file.
          *
          * Opens filename for writing and delegates to write(std::ostream*).
          * The stream is a stack object, so it is closed and released even when
          * the delegated write throws.
          *
          * @param filename path of the file to create or overwrite
          */
         void write(const std::string filename) {
             std::ofstream out(filename.c_str());
             this->write(&out);
             out.close();
         }


         //iterator types of the model are those of the underlying pattern store
         using iterator = typename MapType::iterator;
         using const_iterator = typename MapType::const_iterator;


         /** @return the stored maximum pattern length (maxn) */
         virtual int maxlength() const {
             return this->maxn;
         }

         /** @return the stored minimum pattern length (minn) */
         virtual int minlength() const {
             return this->minn;
         }

         virtual unsigned int occurrencecount(const Pattern & pattern)  {

             ValueType * data = this->getdata(pattern);

             if (data != NULL) {

                 return this->valuehandler.count(*data);

             } else {

                 return 0;

             }

         }


         virtual unsigned int occurrencecount(const PatternPointer & pattern)  {

             ValueType * data = this->getdata(pattern);

             if (data != NULL) {

                 return this->valuehandler.count(*data);

             } else {

                 return 0;

             }

         }


         /**
          * Look up the value stored for a pattern.
          *
          * @param pattern   the pattern to look up
          * @param makeifnew when true, a missing pattern is inserted via operator[]
          *                  and a pointer to its new value is returned
          * @return pointer to the stored value, or NULL when the pattern is
          *         absent and makeifnew is false
          */
         virtual ValueType * getdata(const Pattern & pattern, bool makeifnew=false) {
             typename MapType::iterator position = this->find(pattern);
             if (position != this->end()) return &(position->second);
             if (!makeifnew) return NULL;
             return &((*this)[pattern]);
         }


         /**
          * Look up the value stored for a pattern given as a PatternPointer.
          *
          * @param pattern   the pattern to look up
          * @param makeifnew when true, a missing pattern is inserted via operator[]
          *                  and a pointer to its new value is returned
          * @return pointer to the stored value, or NULL when the pattern is
          *         absent and makeifnew is false
          */
         virtual ValueType * getdata(const PatternPointer & pattern, bool makeifnew=false) {
             typename MapType::iterator position = this->find(pattern);
             if (position != this->end()) return &(position->second);
             if (!makeifnew) return NULL;
             return &((*this)[pattern]);
         }


         /**
          * Total number of word types.
          *
          * The value is computed lazily: while totaltypes is still 0 and the
          * model holds data, it is filled in from totalwordtypesingroup(0, 0).
          *
          * @return totaltypes
          */
         virtual unsigned int types() {
             const bool notcomputedyet = (totaltypes == 0);
             if (notcomputedyet && !this->data.empty()) {
                 totaltypes = this->totalwordtypesingroup(0, 0);
             }
             return totaltypes;
         }


         /** @return the stored total token count (totaltokens) */
         virtual unsigned int tokens() const {
             return totaltokens;
         }


         /** @return the stored model type (model_type) */
         unsigned char type() const {
             return model_type;
         }

         /** @return the stored model version (model_version) */
         unsigned char version() const {
             return model_version;
         }


         //Declaration only; the definition is not part of this section.
         //NOTE(review): presumably writes a representation of the model to the given stream -- confirm at the definition
         void output(std::ostream *);


         /**
          * Coverage count of a pattern.
          * @return the occurrence count of key multiplied by key.size()
          */
         unsigned int coveragecount(const Pattern &  key) {
             const unsigned int occurrences = this->occurrencecount(key);
             return occurrences * key.size();
         }

         /**
          * Coverage of a pattern as a fraction.
          * @return coveragecount(key) divided by the total token count, tokens()
          */
         double coverage(const Pattern & key) {
             const unsigned int covered = this->coveragecount(key);
             const double total = static_cast<double>(this->tokens());
             return covered / total;
         }


         std::vector<PatternPointer> getreverseindex(const IndexReference ref, int occurrencecount = 0, int category = 0, unsigned int size = 0) {

             //Auxiliary function

             std::vector<PatternPointer> result;

             if (!this->reverseindex) return result;

             const unsigned int sl = this->reverseindex->sentencelength(ref.sentence);

             //std::cerr << "DEBUG: getreverseindex sentencelength(" << ref.sentence << ")=" << sl << std::endl;

             const unsigned int minn = this->minlength();

             const unsigned int maxn = this->maxlength();

             for (unsigned int n = minn; ref.token + n <= sl && n <= maxn; n++) {

                 if ((size == 0) || (n == size)) {

                     try {

                         //std::cerr << "DEBUG: getreverseindex getpattern " << ref.tostring() << " + " << n << std::endl;

                         const PatternPointer ngram = this->reverseindex->getpattern(ref,n);

                         /*std::cerr << "n: " << ngram.n() << std::endl;

                         std::cerr << "bytesize: " << ngram.bytesize() << std::endl;;

                         std::cerr << "hash: " << ngram.hash() << std::endl;*/

                         if ( (((occurrencecount == 0) && this->has(ngram)) || (this->occurrencecount(ngram) >= (unsigned int) occurrencecount))

                             && ((category == 0) || (ngram.category() >= category)) ) {

                             result.push_back(ngram);


                             if (((category == 0) || (category == SKIPGRAM)) && (this->hasskipgrams))  {


                                 //(we can't use gettemplates() because

                                 //gettemplates() depends on us, we have to

                                 //solve it low-level, punching holes:


                                 std::vector<PatternPointer> skipgrams = this->findskipgrams(ngram, occurrencecount);

                                 for (auto skipgram : skipgrams) {

                                     result.push_back(skipgram);

                                 }


                                 //TODO: flexgrams


                             }

                         }

                     } catch (KeyError &e) {

                         break;

                     }

                 }

             }

             return result;

         }


         /*std::vector<Pattern> getreverseindex_bysentence(int sentence) {

             //Auxiliary function

             std::vector<Pattern> result;

             for (int i = 0; i < this->reverseindex.sentencelength(sentence); i++) {

                 const IndexReference ref = IndexReference(sentence, i);

                 std::vector<Pattern> tmpresult =  this->getreverseindex(ref);

                 for (std::vector<Pattern>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {

                     const Pattern pattern = *iter;

                     result.push_back(pattern);

                 }

             }

             return result;

         }*/


         std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_bysentence(int sentence) {

             //Auxiliary function

             std::vector<std::pair<IndexReference,PatternPointer>> result;

             for (int i = 0; i < this->reverseindex->sentencelength(sentence); i++) {

                 const IndexReference ref = IndexReference(sentence, i);

                 std::vector<PatternPointer> tmpresult =  this->getreverseindex(ref);

                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {

                     const PatternPointer pattern = *iter;

                     result.push_back(std::pair<IndexReference,PatternPointer>(ref,pattern));

                 }

             }

             return result;

         }


         std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_right(const IndexReference ref) {

             //Auxiliary function

             std::vector<std::pair<IndexReference,PatternPointer>> result;

             for (int i = ref.token+1; i < this->reverseindex->sentencelength(ref.sentence); i++) {

                 const IndexReference ref2 = IndexReference(ref.sentence, i);

                 std::vector<PatternPointer> tmpresult =  this->getreverseindex(ref);

                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {

                     const PatternPointer pattern = *iter;

                     result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));

                 }

             }

             return result;

         }


         std::vector<std::pair<IndexReference,PatternPointer>> getreverseindex_left(const IndexReference ref) {

             //Auxiliary function

             std::vector<std::pair<IndexReference,PatternPointer>> result;

             for (int i = 0; i < ref.token; i++) {

                 const IndexReference ref2 = IndexReference(ref.sentence, i);

                 std::vector<PatternPointer> tmpresult =  this->getreverseindex(ref);

                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {

                     const PatternPointer pattern = *iter;

                     result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));

                 }

             }

             return result;

         }


         /**
          * Recompute the per-group statistics caches from scratch.
          *
          * Clears cache_categories, cache_n, cache_grouptotal and
          * cache_grouptotalpatterns, then makes one pass over all patterns and
          * accumulates, per (category, n) group:
          *  - cache_grouptotal: summed occurrence counts
          *  - cache_grouptotalpatterns: number of distinct patterns
          * Index 0 serves as the "all" bucket for both the category and the n
          * dimension. Flexgrams are only counted in the n == 0 buckets.
          */
         void computestats() {
             cache_categories.clear();
             cache_n.clear();
             cache_grouptotal.clear();
             cache_grouptotalpatterns.clear();
             cache_categories.insert(0);
             cache_n.insert(0);

             for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
                 //bind by reference: no need to copy every pattern just to inspect it
                 const PatternType & pattern = iter->first;
                 const int c = pattern.category();
                 const int n = pattern.n();
                 cache_categories.insert(c);
                 cache_n.insert(n);

                 //occurrence count of this pattern, computed once and reused for all buckets
                 const auto occurrences = this->valuehandler.count(iter->second);

                 //total of occurrences in a group, used for frequency computation
                 if (c != FLEXGRAM){
                     //no storage per N for dynamic skipgrams
                     cache_grouptotal[c][n] += occurrences;
                     cache_grouptotal[0][n] += occurrences;
                     cache_grouptotalpatterns[c][n]++;
                     cache_grouptotalpatterns[0][n]++;
                 }
                 cache_grouptotal[c][0] += occurrences;
                 cache_grouptotal[0][0] += occurrences;

                 //total of distinct patterns in a group
                 cache_grouptotalpatterns[c][0]++;
                 cache_grouptotalpatterns[0][0]++;
             }
         }


         /**
          * Discards the cached coverage statistics (covered tokens and covered
          * word types per group). The caches filled by computestats() are left untouched.
          */
         virtual void resetstats() {
             cache_grouptotaltokens.clear();
             cache_grouptotalwordtypes.clear();
         }


         /**
          * Fills the coverage caches (cache_grouptotalwordtypes and
          * cache_grouptotaltokens) for the requested groups.
          *
          * @param category Only process this category; 0 processes every cached category
          * @param n        Only process this size; 0 processes every cached size
          *
          * A group is only processed while its cached word type count is still 0.
          * For each processed group the whole model is iterated once, collecting the
          * distinct unigrams of the matching patterns in a temporary set.
          */
         virtual void computecoveragestats(int category = 0, int n = 0) {

             //make sure the category/size caches exist before looping over them
             if ((cache_grouptotal.empty()) && (!this->data.empty())) this->computestats();

             //bool hasunigrams = false;


             //opting for memory over speed (more iterations, less memory)

             // Indexed model overloads this for better cache_grouptotaltokens computation!

             for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {

                 if ((category == 0) || (*iterc == category)) {

                  for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {

                   //skip groups that were not requested or that already hold a non-zero type count
                   if (((n == 0) || (*itern == n)) && (cache_grouptotalwordtypes[*iterc][*itern] == 0) )  {

                     //distinct unigrams seen for this (category, n) group
                     std::unordered_set<Pattern> types;

                     PatternModel::iterator iter = this->begin();

                     while (iter != this->end()) {

                         const PatternType pattern = iter->first;

                         const int pn = (int) pattern.n();

                         //a unigram is its own word type; only counted for the groups n=0 and n=1
                         if ( (pn == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {

                             types.insert(pattern);

                         } else {

                             //pattern belongs to this group: add each of its unigrams
                             if (((*itern == 0) || (pn == *itern))  && ((*iterc == 0) || (pattern.category() == *iterc))) {

                                 std::vector<PatternType> unigrams;

                                 pattern.ngrams(unigrams, 1);

                                 for (typename std::vector<PatternType>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {

                                     const PatternType p = *iter2;

                                     types.insert(p);

                                 }

                             }

                         }

                         //NOTE(review): this sits outside both branches above, so the occurrence
                         //count of EVERY pattern is added to this group's token total, whether or
                         //not the pattern matches the group - confirm this is the intended projection

                         cache_grouptotaltokens[*iterc][*itern] += this->valuehandler.count(iter->second);

                         iter++;

                     }

                     cache_grouptotalwordtypes[*iterc][*itern] += types.size();

                   }

                  }

                 }

             }


             /*

             if (!hasunigrams) {

                 for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {

                     int max = 0;

                     for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {

                         if cache_grouptotalwordtypes[*iterc][*itern]

                     }

                 }


             }*/

         }


         /**
          * Returns the summed occurrence count of all patterns in a group.
          * The statistics are computed first if the cache is empty while the model is not.
          *
          * @param category The category of the group, 0 for all categories
          * @param n        The size of the group, 0 for all sizes
          */
         unsigned int totaloccurrencesingroup(int category, int n) {
             const bool statsmissing = cache_grouptotal.empty() && !this->data.empty();
             if (statsmissing) {
                 this->computestats();
             }
             return cache_grouptotal[category][n];
         }


         /**
          * Returns the number of distinct patterns in a group.
          * The statistics are computed first if the cache is empty while the model is not.
          *
          * @param category The category of the group, 0 for all categories
          * @param n        The size of the group, 0 for all sizes
          */
         unsigned int totalpatternsingroup(int category, int n) {
             const bool statsmissing = cache_grouptotalpatterns.empty() && !this->data.empty();
             if (statsmissing) {
                 this->computestats();
             }
             return cache_grouptotalpatterns[category][n];
         }


         /**
          * Returns the number of word/unigram types covered by the patterns in a group.
          *
          * @param category The category of the group, 0 for all categories
          * @param n        The size of the group, 0 for all sizes
          *
          * NOTE(review): coverage statistics are only computed while the cache is
          * completely empty, and then only for the requested group; a later call for
          * a different group reads whatever is (or is not) cached - confirm callers
          * always start with (0,0) or reset the statistics in between.
          */
         unsigned int totalwordtypesingroup(int category, int n) {
             const bool statsmissing = cache_grouptotalwordtypes.empty() && !this->data.empty();
             if (statsmissing) {
                 this->computecoveragestats(category,n);
             }
             return cache_grouptotalwordtypes[category][n];
         }

         /**
          * Returns the number of tokens covered by the patterns in a group.
          * Coverage statistics are computed first if the token cache is empty while
          * the model is not.
          *
          * @param category The category of the group, 0 for all categories
          * @param n        The size of the group, 0 for all sizes
          */
         unsigned int totaltokensingroup(int category, int n) {
             const bool statsmissing = cache_grouptotaltokens.empty() && !this->data.empty();
             if (statsmissing) {
                 this->computecoveragestats(category,n);
             }
             return cache_grouptotaltokens[category][n];
         }


         /**
          * Returns the relative frequency of a pattern: its occurrence count divided
          * by the total occurrence count of the group sharing its category and size.
          *
          * @param pattern The pattern to compute the frequency for
          */
         double frequency(const Pattern & pattern) {
             const double grouptotal = (double) totaloccurrencesingroup(pattern.category(),pattern.n());
             return this->occurrencecount(pattern) / grouptotal;
         }


         virtual void add(const PatternPointer & patternpointer, const IndexReference & ref) {

             const Pattern pattern = Pattern(patternpointer);

             /*if ((pattern.isskipgram()) || (pattern.isflexgram())) { //TODO: remove

                 std::cerr << "Adding skipgram!" << std::endl;

                 std::cerr << "pp.mask=" << patternpointer.mask << std::endl;

                 std::cerr << "pp.b=" << patternpointer.bytesize() << std::endl;

                 std::cerr << "p.b=" << pattern.bytesize() << std::endl;

                 patternpointer.out();

                 std::cerr << std::endl;

                 pattern.out();

                 throw InternalError();

             }*/

             ValueType * data = getdata(pattern, true);

             this->add(pattern, data, ref );

         }


         /**
          * Registers an occurrence in an already obtained value slot.
          *
          * @param pattern The pattern the value belongs to (not consulted here)
          * @param value   The value to update; must not be NULL
          * @param ref     The position at which the pattern occurs
          * @throws InternalError when value is NULL
          */
         virtual void add(const Pattern & pattern, ValueType * value, const IndexReference & ref) {
             if (value != NULL) {
                 this->valuehandler.add(value, ref);
                 return;
             }
             std::cerr << "Add() value is NULL!" << std::endl;
             throw InternalError();
         }

         /**
          * Registers an occurrence in an already obtained value slot (pattern pointer variant).
          *
          * @param pattern The pattern the value belongs to (not consulted here)
          * @param value   The value to update; must not be NULL
          * @param ref     The position at which the pattern occurs
          * @throws InternalError when value is NULL
          */
         virtual void add(const PatternPointer & pattern, ValueType * value, const IndexReference & ref) {
             if (value != NULL) {
                 this->valuehandler.add(value, ref);
                 return;
             }
             std::cerr << "Add() value is NULL!" << std::endl;
             throw InternalError();
         }


         unsigned int prune(int threshold,int _n=0) {

             //prune all patterns under the specified threshold (set -1 for

             //all) and of the specified length (set _n==0 for all)

             unsigned int pruned = 0;

             PatternModel::iterator iter = this->begin();

             while (iter != this->end()) {

                 const PatternType pattern = iter->first;

                 if (( (_n == 0) || (pattern.n() == (unsigned int) _n) )&& ((threshold == -1) || (occurrencecount(pattern) < (unsigned int) threshold))) {

                     /*std::cerr << "preprune:" << this->size() << std::endl;

                     std::cerr << "DEBUG: pruning " << (int) pattern.category() << ",n=" << pattern.n() << ",skipcount=" << pattern.skipcount() << ",hash=" << pattern.hash() << std::endl;

                     std::cerr << occurrencecount(pattern) << std::endl;*/

                     iter = this->erase(iter);

                     //std::cerr << "postprune:" << this->size() << std::endl;

                     pruned++;

                 } else {

                     iter++;

                 }

             };


             return pruned;

         }


         virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n = 0) {

             //NOTE: minskiptypes is completely ignored! that only works for indexed models

             unsigned int pruned = 0;

             if (minskiptypes <=1) return pruned; //nothing to do


             typename PatternModel<ValueType,BaseValueHandler<ValueType>,MapType>::iterator iter = this->begin();

             while(iter != this->end()) {

                 const PatternType pattern = iter->first;

                 if (( (_n == 0) || ((int) pattern.n() == _n) ) && (pattern.category() == SKIPGRAM)) {

                     if (this->occurrencecount(pattern) < threshold) {

                         iter = this->erase(iter);

                         pruned++;

                         continue;

                     }

                 }

                 iter++;

             }

             return pruned;

         }


         unsigned int prunenotinset(const std::unordered_set<Pattern> & s, int _n) {

             unsigned int pruned = 0;

             if (s.empty()) {

                 return pruned;

             }

             PatternModel::iterator iter = this->begin();

             while (iter != this->end()) {

                 const PatternType pattern = iter->first;

                 if ( (_n == 0) || (pattern.n() == (unsigned int) _n) ) {

                     if (s.find(pattern) == s.end()) {

                         //not found in set

                         iter = this->erase(iter);

                         pruned++;

                         continue;

                     }

                 }

                 iter++;

             };


             return pruned;

         }


         /**
          * Removes all patterns that are not present in a second model.
          *
          * Constraint models are normally applied directly during loading; this
          * function remains useful when two models are already in memory.
          *
          * @param secondmodel The model to test membership against (via has())
          * @return The number of patterns removed
          */
         template<class ValueType2,class ValueHandler2,class MapType2>
         unsigned int prunebymodel(PatternModel<ValueType2,ValueHandler2,MapType2> & secondmodel) {
             unsigned int pruned = 0;
             //use this model's own iterator type, as prune() and prunenotinset() do,
             //rather than one hard-coded to IndexedData/IndexedDataHandler
             PatternModel::iterator iter = this->begin();
             while (iter != this->end()) {
                 const PatternType pattern = iter->first;
                 if (!secondmodel.has(pattern)) {
                     //erase hands back the iterator to the next element
                     iter = this->erase(iter);
                     pruned++;
                     continue;
                 }
                 iter++;
             }
             return pruned;
         }


         std::vector<std::pair<Pattern, int> > getpatterns(const Pattern & pattern) {

             //get all patterns in pattern

             std::vector<std::pair<Pattern, int> > v;

             std::vector<std::pair<Pattern, int> > ngrams;

             pattern.subngrams(ngrams, minlength(), maxlength());

             for (std::vector<std::pair<Pattern, int> >::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {

                 const Pattern p = iter->first;

                 if (this->has(p)) v.push_back(*iter);


                 //TODO: match with skipgrams

             }

             return v;

         }


         /**
          * Prints the entire model as tab-separated rows, preceded by a header line.
          * A legend explaining the columns is written to standard error afterwards.
          * Nothing at all is written when the model holds no patterns.
          *
          * @param out     The stream to write the header and the rows to
          * @param decoder The class decoder used to turn patterns into text
          */
         virtual void print(std::ostream * out, ClassDecoder & decoder) {
             PatternModel::iterator iter = this->begin();
             if (iter == this->end()) return; //empty model: no header, no rows, no legend

             *out << "PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY" << std::endl;
             for (; iter != this->end(); iter++) {
                 const PatternType pattern = iter->first;
                 this->print(out, decoder, pattern, true);
             }

             std::cerr << std::endl << "Legend:" << std::endl;
             std::cerr << " - PATTERN    : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using  {**}." << std::endl;
             std::cerr << " - COUNT      : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
             std::cerr << " - TOKENS     : The maximum number of tokens in the corpus that this pattern covers. *THIS IS JUST A MAXIMUM PROJECTION* rather than an exact number because your model is not indexed" << std::endl;
             std::cerr << " - COVERAGE   : The maximum number of tokens covered, as a fraction of the total in the corpus (projection)" << std::endl;
             std::cerr << " - CATEGORY   : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
             std::cerr << " - SIZE       : The size of the pattern (in tokens)" << std::endl;
             std::cerr << " - FREQUENCY  : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
             std::cerr << " - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
         }


         virtual void printreverseindex(std::ostream * out, ClassDecoder & decoder) {

             if (!this->reverseindex) return;

             for (IndexedCorpus::iterator iter = reverseindex->begin(); iter != reverseindex->end(); iter++) {

                 const IndexReference ref = iter->first;

                 std::vector<PatternPointer> rindex = this->getreverseindex(ref);

                 *out << ref.tostring();

                 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {

                     const Pattern p = *iter2;

                     *out << "\t" << p.tostring(decoder);

                 }

                 *out << "\n";

             }

             *out << std::endl;

         }


         /**
          * Alias for print(out, decoder); exists because cython can't deal with a
          * method named print.
          */
         void printmodel(std::ostream * out, ClassDecoder & decoder) {
             this->print(out, decoder);
         }


         /**
          * Prints a single pattern as one tab-separated row: pattern, occurrence
          * count, an empty field, coverage count, coverage, category, size and frequency.
          *
          * NOTE(review): the empty field after the count makes a row one column
          * wider than the header written by print(out, decoder) - confirm whether
          * consumers of this output depend on that.
          *
          * @param out     The stream to write to
          * @param decoder The class decoder used to turn the pattern into text
          * @param pattern The pattern to print
          * @param endline Whether to terminate the row with std::endl
          */
         virtual void print(std::ostream* out, ClassDecoder &decoder, const PatternType & pattern, bool endline = true) {
             const std::string pattern_s = pattern.tostring(decoder);
             const unsigned int count = this->occurrencecount(pattern);
             const unsigned int covcount = this->coveragecount(pattern);
             const double coverage = covcount / (double) this->tokens();
             const double freq = this->frequency(pattern);

             //translate the numeric category into a label; unknown categories stay empty
             const int cat = pattern.category();
             std::string cat_s;
             switch (cat) {
                 case 1:
                     cat_s = "ngram";
                     break;
                 case 2:
                     cat_s = "skipgram";
                     break;
                 case 3:
                     cat_s = "flexgram";
                     break;
                 default:
                     break;
             }

             std::ostream & os = *out;
             os << pattern_s << "\t" << count << "\t" << "\t" << covcount;
             os << "\t" << coverage << "\t" << cat_s;
             os << "\t" << pattern.size() << "\t" << freq;
             if (endline) os << std::endl;
         }


         /**
          * Alias for print(out, decoder, pattern, endline); exists because cython
          * can't deal with methods named print.
          */
         void printpattern(std::ostream* out, ClassDecoder &decoder, const Pattern & pattern, bool endline = true) {
             this->print(out,decoder,pattern,endline);
         }


         /**
          * Builds a histogram that maps an occurrence count to the number of
          * patterns having that occurrence count.
          *
          * @param hist      The map to add the bins to (it is not cleared first)
          * @param threshold Only patterns occurring at least this often are counted
          * @param cap       When non-zero, only the top of the histogram is kept: bins
          *                  are accumulated from the highest occurrence count downwards
          *                  until they jointly hold at least cap patterns, and all bins
          *                  below that point are removed. If the cap is never reached,
          *                  all bins are kept.
          * @param category  Only count patterns of this category; 0 counts all
          * @param size      Only count patterns of this size; 0 counts all
          */
         void histogram(std::map<unsigned int,unsigned int> & hist, unsigned int threshold = 0, unsigned int cap = 0, int category = 0, unsigned int size = 0) {
             for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
                 const PatternType pattern = iter->first;
                 if (((category != 0) && (pattern.category() != category)) || ((size != 0) && (size != pattern.size()))) continue;
                 unsigned int c = this->occurrencecount(pattern);
                 if (c >= threshold) hist[c]++;
             }
             if (cap > 0) {
                 unsigned int sum = 0;
                 std::map<unsigned int,unsigned int>::reverse_iterator riter = hist.rbegin();
                 //test for rend() BEFORE reading the bin, and read the bin before advancing,
                 //so the highest bin is included and rend() is never dereferenced
                 while ((riter != hist.rend()) && (sum < cap)) {
                     sum += riter->second;
                     ++riter;
                 }
                 //riter.base() now refers to the lowest bin that was accumulated;
                 //delete everything else (i.e. all bins below it)
                 hist.erase(hist.begin(), riter.base());
             }
         }


         /**
          * Computes the occurrence threshold that holds the top $amount patterns:
          * the lowest occurrence count among the bins kept by the capped histogram.
          *
          * @param amount   The number of top patterns to hold
          * @param category Only consider patterns of this category; 0 considers all
          * @param size     Only consider patterns of this size; 0 considers all
          * @return The threshold, or 0 when no pattern matches
          */
         unsigned int topthreshold(int amount, int category=0, int size=0) {
             std::map<unsigned int,unsigned int> hist;
             histogram(hist, 0, amount, category, size);
             if (hist.empty()) return 0;
             //a zero amount means no cap was applied; keep returning the highest count then
             if (amount <= 0) return hist.rbegin()->first;
             return hist.begin()->first;
         }


         /**
          * Computes a histogram (see the map-based overload for the parameters) and
          * writes it to a stream as a small table with a line per occurrence count.
          *
          * @param OUT The stream to write to
          */
         void histogram(std::ostream * OUT, unsigned int threshold = 0, unsigned int cap = 0 , int category = 0, unsigned int size = 0) {
             std::map<unsigned int,unsigned int> bins;
             histogram(bins,threshold,cap,category,size);

             std::ostream & os = *OUT;
             os << "HISTOGRAM" << std::endl;
             os << "------------------------------" << std::endl;
             os << "OCCURRENCES\tPATTERNS" << std::endl;

             std::map<unsigned int,unsigned int>::iterator bin = bins.begin();
             while (bin != bins.end()) {
                 os << bin->first << "\t" << bin->second << std::endl;
                 bin++;
             }
         }


         void info(std::ostream * OUT) {

             if (this->getmodeltype() == INDEXEDPATTERNMODEL) {

                 *OUT << "Type: indexed" << std::endl;

             } else if (this->getmodeltype() == UNINDEXEDPATTERNMODEL) {

                 *OUT << "Type: unindexed" << std::endl;

             } else {

                 //should never happen

                 *OUT << "Type: unknown" << std::endl;

             }

             *OUT << "Total tokens: " << this->totaltokens << std::endl;

             *OUT << "Total word types: " << this->totaltypes << std::endl;

             *OUT << "Types patterns loaded: " << this->size() << std::endl;

             *OUT << "Min n: " << this->minn << std::endl;

             *OUT << "Max n: " << this->maxn << std::endl;

             if (this->reverseindex)  {

                 *OUT << "Reverse index: yes" << std::endl;

                 *OUT << "References in reverse index: " << this->reverseindex->size() << std::endl;

             } else {

                 *OUT << "Reverse index: no" << std::endl;

             }

             *OUT << "Size of Pattern: " << sizeof(Pattern) << " byte" << std::endl;

             *OUT << "Size of ValueType: " << sizeof(ValueType) << " byte" << std::endl;

             unsigned int totalkeybs = 0;

             unsigned int totalvaluebs = 0;

             for (PatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {

                 const PatternType pattern = iter->first;

                 totalkeybs += sizeof(PatternType) + pattern.bytesize();

                 totalvaluebs += sizeof(ValueType);

             }

             *OUT << "Total key bytesize (patterns): " <<  totalkeybs << " bytes (" << (totalkeybs/1024/1024) << " MB)" << std::endl;

             *OUT << "Total value bytesize (counts/index): " <<  totalvaluebs << " bytes (" << (totalvaluebs/1024/1024) << " MB)" << std::endl;

             *OUT << "Mean key bytesize: " << (totalkeybs / (float) this->size()) << std::endl;

             *OUT << "Mean value bytesize: " << (totalvaluebs / (float) this->size()) << std::endl;


             unsigned int ri_totalkeybs = 0;

             unsigned int ri_totalvaluebs = 0;

             if (this->reverseindex) {

                 for (IndexedCorpus::iterator iter = this->reverseindex->begin(); iter != this->reverseindex->end(); iter++) {

                     ri_totalkeybs += sizeof(iter->first.sentence) + sizeof(iter->first.token);

                     ri_totalvaluebs += sizeof(IndexPattern); // sizeof(Pattern) + iter->pattern().bytesize();

                 }

                 *OUT << "Total key bytesize in reverse index (references): " <<  ri_totalkeybs << " bytes (" << (ri_totalkeybs/1024/1024) << " MB)" << std::endl;

                 *OUT << "Total value bytesize in reverse index (patterns): " <<  ri_totalvaluebs << " bytes (" << (ri_totalvaluebs/1024/1024) << " MB)" << std::endl;

             }


             const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);

             *OUT << "Total bytesize (without overhead): " << t << " bytes (" << (t/1024/1024) << " MB)" << std::endl;

         }


         /**
          * Writes a statistical report to a stream: overall covered/uncovered token
          * and type counts, followed by a table with one row per (category, size)
          * group for which pattern counts are cached. A legend is written to
          * standard error when the table has at least one row.
          *
          * Coverage statistics are computed first when the token cache is empty
          * while the model is not.
          *
          * @param OUT The stream to write the report to
          */
         void report(std::ostream * OUT) {
             if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) {
                 std::cerr << "Computing statistics..." << std::endl;
                 this->computecoveragestats();
             }
             *OUT << std::setiosflags(std::ios::fixed) << std::setprecision(4) << std::endl;
             *OUT << "REPORT" << std::endl;
             if (this->getmodeltype() == UNINDEXEDPATTERNMODEL) {
                 *OUT << "   Warning: Model is unindexed, token coverage counts are mere maximal projections" << std::endl;
                 *OUT << "            assuming no overlap at all!!! Use an indexed model for accurate coverage counts" << std::endl;
             }
             *OUT << "----------------------------------" << std::endl;
             *OUT << "                          " << std::setw(15) << "PATTERNS" << std::setw(15) << "TOKENS" << std::setw(15) << "COVERAGE" << std::setw(15) << "TYPES" << std::endl;
             *OUT << "Total:                    " << std::setw(15) << "-" << std::setw(15) << this->tokens() << std::setw(15) << "-" << std::setw(15) << this->types() <<  std::endl;

             unsigned int coveredtypes = totalwordtypesingroup(0,0);  //will also work when no unigrams in model!
             unsigned int coveredtokens = totaltokensingroup(0,0);

             //the covered count is a projection and may exceed the corpus size; clamp it
             //so the unsigned subtraction below cannot wrap around
             if (coveredtokens > this->tokens()) coveredtokens = this->tokens();
             const unsigned int uncoveredtokens = this->tokens() - coveredtokens;

             //same protection for the types column
             const auto alltypes = this->types();
             *OUT << "Uncovered:                " << std::setw(15) << "-" << std::setw(15) << uncoveredtokens << std::setw(15) << uncoveredtokens / (double) this->tokens() << std::setw(15) << ((alltypes > coveredtypes) ? (alltypes - coveredtypes) : 0) <<  std::endl;
             *OUT << "Covered:                  " << std::setw(15) << this->size() << std::setw(15) << coveredtokens << std::setw(15) << coveredtokens / (double) this->tokens() <<  std::setw(15) << coveredtypes <<  std::endl << std::endl;

             bool haveoutput = false;
             for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
                 const int c = *iterc;
                 if (cache_grouptotalpatterns.count(c)) {
                     for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
                         const int n = *itern;
                         if (cache_grouptotalpatterns[c].count(n)) {
                             if (!haveoutput) {
                                 //output headers
                                 *OUT << std::setw(15) << "CATEGORY" << std::setw(15) << "N (SIZE) "<< std::setw(15) << "PATTERNS";
                                 if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) *OUT << std::setw(15) << "TOKENS" << std::setw(15) << "COVERAGE";
                                 *OUT << std::setw(15) << "TYPES" << std::setw(15) << "OCCURRENCES" << std::endl;
                                 haveoutput = true;
                             }
                             //category
                             if (c == 0) {
                                 *OUT << std::setw(15) << "all";
                             } else if (c == NGRAM) {
                                 *OUT << std::setw(15) << "n-gram";
                             } else if (c == SKIPGRAM) {
                                 *OUT << std::setw(15) << "skipgram";
                             } else if (c == FLEXGRAM) {
                                 *OUT << std::setw(15) << "flexgram";
                             }
                             //size
                             if (n == 0) {
                                 *OUT << std::setw(15) << "all";
                             } else {
                                 *OUT << std::setw(15) << n;
                             }
                             //patterns
                             *OUT << std::setw(15) << cache_grouptotalpatterns[c][n];
                             if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) {
                                 //tokens
                                 *OUT << std::setw(15) << cache_grouptotaltokens[c][n];
                                 //coverage
                                 *OUT << std::setw(15) << cache_grouptotaltokens[c][n] / (double) this->tokens();
                             }
                             //types
                             *OUT << std::setw(15) << cache_grouptotalwordtypes[c][n];
                             //occurrences
                             *OUT << std::setw(15) << cache_grouptotal[c][n] << std::endl;
                         }
                     }
                 }
             }

             if (haveoutput) {
                 std::cerr << std::endl << "Legend:" << std::endl;
                 std::cerr << " - PATTERNS    : The number of distinct patterns within the group" << std::endl;
                 if (this->getmodeltype() != UNINDEXEDPATTERNMODEL) {
                     std::cerr << " - TOKENS      : The number of tokens that is covered by the patterns in the group." << std::endl;
                     std::cerr << " - COVERAGE    : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
                 }
                 std::cerr << " - TYPES       : The number of unique *word/unigram* types in this group" << std::endl;
                 std::cerr << " - OCCURRENCES : The total number of occurrences of the patterns in this group" << std::endl;
             }
         }


         /**
          * Extracts a set of patterns from the model. Patterns whose size lies within
          * [minlength, maxlength] are copied as they are; longer patterns contribute
          * their sub-ngrams of those sizes instead; shorter patterns are left out.
          *
          * @param minlength The minimum pattern size to include
          * @param maxlength The maximum pattern size to include
          * @return The resulting pattern set
          */
         PatternSet<uint64_t> extractset(int minlength = 1, int maxlength = 1) {
             PatternSet<uint64_t> collected;
             for (PatternModel::iterator entry = this->begin(); entry != this->end(); entry++) {
                 const PatternType pattern = entry->first;
                 const int patternlength = pattern.n();
                 if (patternlength > maxlength) {
                     //too long: break it down into parts of acceptable size
                     std::vector<Pattern> parts;
                     pattern.subngrams(parts,minlength, maxlength);
                     std::vector<Pattern>::iterator part = parts.begin();
                     while (part != parts.end()) {
                         const Pattern pattern2 = *part;
                         collected.insert(pattern2);
                         part++;
                     }
                 } else if (patternlength >= minlength) {
                     collected.insert(pattern);
                 }
             }
             return collected;
         }


         //The virtual functions below are placeholders in this class: each has an
         //empty body or returns an empty relation map / zero. The trailing comments
         //state that they do nothing for unindexed models.

         virtual void outputrelations(const Pattern & pattern, ClassDecoder & classdecoder, std::ostream * OUT) {} //does nothing for unindexed models

         virtual t_relationmap getsubchildren(const Pattern & pattern,int = 0, int = 0, int = 0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap getsubparents(const Pattern & pattern,int = 0, int = 0, int = 0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap gettemplates(const Pattern & pattern,int = 0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap getinstances(const Pattern & pattern,int = 0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap getskipcontent(const PatternPointer & pattern) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap getleftneighbours(const Pattern & pattern,int = 0, int = 0,int = 0,int =0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap getrightneighbours(const Pattern & pattern,int = 0, int = 0,int = 0,int =0) { return t_relationmap(); } //does nothing for unindexed models

         virtual t_relationmap_double getnpmi(const Pattern & pattern, double threshold) { return t_relationmap_double(); } //does nothing for unindexed models

         virtual int computeflexgrams_fromskipgrams() { return 0; }//does nothing for unindexed models

         virtual int computeflexgrams_fromcooc() {return 0; }//does nothing for unindexed models

         virtual void outputcooc_npmi(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {} //empty body here as well

         virtual void outputcooc(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {} //empty body here as well

 };


 template<class MapType = PatternMap<IndexedData,IndexedDataHandler>,class PatternType = Pattern>

 class IndexedPatternModel: public PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType> {

     protected:

         virtual void postread(const PatternModelOptions options) {

             for (typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin(); iter != this->end(); iter++) {

                 const Pattern p = iter->first;

                 const int n = p.n();

                 if (n > this->maxn) this->maxn = n;

                 if (n < this->minn) this->minn = n;

                 if ((!this->hasskipgrams) && (p.isskipgram())) this->hasskipgrams = true;

             }

         }

         virtual void posttrain(const PatternModelOptions options) {

             if (!options.QUIET) std::cerr << "Sorting all indices..." << std::endl;

             for (typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin(); iter != this->end(); iter++) {

                 iter->second.sort();

             }


         }

    public:


     /**
      * Constructs an empty indexed pattern model.
      *
      * @param corpus Optional corpus; when given it becomes the reverse index and
      *               is attached to the model, otherwise the reverse index is NULL
      */
     IndexedPatternModel<MapType,PatternType>(IndexedCorpus * corpus = NULL): PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>() {
         this->model_type = this->getmodeltype();
         this->model_version = this->getmodelversion();
         this->reverseindex = corpus; //remains NULL when no corpus is passed
         if (corpus != NULL) {
             this->attachcorpus(*corpus);
         }
     }


     /**
      * Constructs an indexed pattern model and loads it from a stream.
      *
      * @param f              The stream to load the model from
      * @param options        The options passed on to load()
      * @param constrainmodel Optional constraint model passed on to load()
      * @param corpus         Optional corpus; when given it becomes the reverse index
      *                       and is attached before loading, otherwise the reverse index is NULL
      */
     IndexedPatternModel<MapType,PatternType>(std::istream *f, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL):  PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>(){
         this->model_type = this->getmodeltype();
         this->model_version = this->getmodelversion();
         this->reverseindex = corpus; //remains NULL when no corpus is passed
         if (corpus != NULL) {
             this->attachcorpus(*corpus);
         }
         this->load(f,options, constrainmodel);
     }


     /**
      * Constructs an indexed pattern model and loads it from a file.
      *
      * @param filename       The file to load the model from
      * @param options        The options passed on to load()
      * @param constrainmodel Optional constraint model passed on to load()
      * @param corpus         Optional corpus; when given it becomes the reverse index
      *                       and is attached before loading, otherwise the reverse index is NULL
      */
     IndexedPatternModel<MapType,PatternType>(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>() { //load from file
         this->model_type = this->getmodeltype();
         this->model_version = this->getmodelversion();
         if (corpus) {
             this->reverseindex = corpus;
             this->attachcorpus(*corpus);
         } else {
             this->reverseindex = NULL;
         }
         //automatic storage: the stream is closed and released even when load() throws,
         //whereas a new/delete pair would leak it in that case
         std::ifstream in(filename.c_str());
         this->load(static_cast<std::istream *>(&in), options, constrainmodel);
         in.close();
     }


     //Virtual destructor with an empty body: nothing is released here.
     virtual ~IndexedPatternModel<MapType,PatternType>() { }


     /** @return The model type identifier of this class (INDEXEDPATTERNMODEL) */
     int getmodeltype() const {
         return INDEXEDPATTERNMODEL;
     }

     /** @return The model version number of this class (2) */
     int getmodelversion() const {
         return 2;
     }


     virtual void add(const Pattern & pattern, IndexedData * value, const IndexReference & ref) {

         if (value == NULL) {

             value = getdata(pattern,true);

         }

         this->valuehandler.add(value, ref);

     }

     virtual void add(const PatternPointer & patternpointer, IndexedData * value, const IndexReference & ref) {

         if (value == NULL) {

             value = getdata(patternpointer,true);

         }

         this->valuehandler.add(value, ref);

     }


     /**
      * Look up the index data stored for a pattern.
      * @param pattern The pattern to look up
      * @param makeifnew When true, a missing pattern gets a new entry (via
      *                  operator[]) and a pointer to it is returned
      * @return Pointer to the stored data, or NULL when the pattern is not
      *         present and makeifnew is false
      */
     IndexedData * getdata(const Pattern & pattern, bool makeifnew=false)  {
         typename MapType::iterator found = this->find(pattern);
         if (found != this->end()) return &(found->second);
         if (!makeifnew) return NULL;
         return &((*this)[pattern]);
     }


     /**
      * Look up the index data stored for a pattern pointer.
      * @param pattern The pattern to look up
      * @param makeifnew When true, a missing pattern gets a new entry (via
      *                  operator[]) and a pointer to it is returned
      * @return Pointer to the stored data, or NULL when the pattern is not
      *         present and makeifnew is false
      */
     IndexedData * getdata(const PatternPointer & pattern, bool makeifnew=false) {
         typename MapType::iterator found = this->find(pattern);
         if (found != this->end()) return &(found->second);
         if (!makeifnew) return NULL;
         return &((*this)[pattern]);
     }


     /**
      * Train the model from a stream. Refuses to train skipgrams when no
      * reverse index is set, then delegates to PatternModel::train() with
      * all arguments unchanged.
      * @throws InternalError when options.DOSKIPGRAMS is set but the reverse index is NULL
      */
     virtual void train(std::istream * in , PatternModelOptions options,  PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
         const bool noreverseindex = (this->reverseindex == NULL);
         if (options.DOSKIPGRAMS && noreverseindex) {
             std::cerr << "ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
             throw InternalError();
         }
         PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::train(in,options,constrainbymodel,continued,firstsentence,ignoreerrors);
     }


     /**
      * Train the model from a file. Refuses to train skipgrams when no
      * reverse index is set, then delegates to PatternModel::train() with
      * all arguments unchanged.
      * @throws InternalError when options.DOSKIPGRAMS is set but the reverse index is NULL
      */
     virtual void train(const std::string & filename, PatternModelOptions options, PatternModelInterface * constrainbymodel = NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) {
         const bool noreverseindex = (this->reverseindex == NULL);
         if (options.DOSKIPGRAMS && noreverseindex) {
             std::cerr << "ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
             throw InternalError();
         }
         PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::train(filename,options,constrainbymodel,continued,firstsentence,ignoreerrors);
     }


     /**
      * Write a summary of the model to a stream: model type, token and type
      * totals, pattern count, min/max n, reverse index presence, and
      * estimated byte sizes of keys and values (without container overhead).
      *
      * Byte totals are accumulated in 64-bit counters so that models larger
      * than 4 GiB do not wrap around.
      * @param OUT The stream to write to
      */
     void info(std::ostream * OUT) {
         const int modeltype = this->getmodeltype();
         if (modeltype == INDEXEDPATTERNMODEL) {
             *OUT << "Type: indexed" << std::endl;
         } else if (modeltype == UNINDEXEDPATTERNMODEL) {
             *OUT << "Type: unindexed" << std::endl;
         } else {
             //should never happen
             *OUT << "Type: unknown" << std::endl;
         }
         *OUT << "Total tokens: " << this->totaltokens << std::endl;
         *OUT << "Total word types: " << this->totaltypes << std::endl;
         *OUT << "Types patterns loaded: " << this->size() << std::endl;
         *OUT << "Min n: " << this->minn << std::endl;
         *OUT << "Max n: " << this->maxn << std::endl;
         if (this->reverseindex)  {
             *OUT << "Reverse index: yes" << std::endl;
             *OUT << "References in reverse index: " << this->reverseindex->size() << std::endl;
         } else {
             *OUT << "Reverse index: no" << std::endl;
         }
         *OUT << "Size of Pattern: " << sizeof(Pattern) << " byte" << std::endl;

         uint64_t totalkeybs = 0;
         uint64_t totalvaluebs = 0;
         uint64_t indexlengthsum = 0;
         for (typename IndexedPatternModel::iterator iter = this->begin(); iter != this->end(); iter++) {
             const Pattern pattern = iter->first;
             totalkeybs += sizeof(Pattern) + pattern.bytesize();
             totalvaluebs += iter->second.size() * sizeof(IndexReference); //sentence + token;
             indexlengthsum += iter->second.size();
         }
         *OUT << "Total key bytesize (patterns): " << totalkeybs << " bytes (" << (totalkeybs/1024/1024) << " MB)" << std::endl;
         *OUT << "Total value bytesize (counts/index): " << totalvaluebs << " bytes (" << (totalvaluebs/1024/1024) << " MB)" << std::endl;
         *OUT << "Mean key bytesize: " << (totalkeybs / (float) this->size()) << std::endl;
         *OUT << "Mean value bytesize: " << (totalvaluebs / (float) this->size()) << std::endl;
         *OUT << "Mean index length (ttr): " << (indexlengthsum / (float) this->size()) << std::endl;

         uint64_t ri_totalkeybs = 0;
         uint64_t ri_totalvaluebs = 0;
         if (this->reverseindex) {
             for (IndexedCorpus::iterator iter = this->reverseindex->begin(); iter != this->reverseindex->end(); iter++) {
                 ri_totalkeybs += sizeof(iter->first.sentence) + sizeof(iter->first.token);
                 ri_totalvaluebs += sizeof(IndexPattern); // sizeof(Pattern) + iter->pattern().bytesize();
             }
             *OUT << "Total key bytesize in reverse index (references): " << ri_totalkeybs << " bytes (" << (ri_totalkeybs/1024/1024) << " MB)" << std::endl;
             *OUT << "Total value bytesize in reverse index (patterns): " << ri_totalvaluebs << " bytes (" << (ri_totalvaluebs/1024/1024) << " MB)" << std::endl;
         }

         const uint64_t t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
         *OUT << "Total bytesize (without overhead): " << t << " bytes (" << (t/1024/1024) << " MB)" << std::endl;
     }


     void print(std::ostream * out, ClassDecoder & decoder) {

         bool haveoutput = false;

         for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {

             if (!haveoutput) {

                 *out << "PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY\tREFERENCES" << std::endl;

                 haveoutput = true;

             }

             const PatternPointer pattern = iter->first;

             this->print(out, decoder, pattern, true);

         }

         if (haveoutput) {

             std::cerr << std::endl << "Legend:" << std::endl;

             std::cerr << " - PATTERN    : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;

             std::cerr << " - COUNT      : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;

             std::cerr << " - TOKENS     : The number of tokens in the corpus that this pattern covers" << std::endl;

             std::cerr << " - COVERAGE   : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;

             std::cerr << " - CATEGORY   : The pattern type category (ngram,skipgram,flexgram)" << std::endl;

             std::cerr << " - SIZE       : The size of the pattern (in tokens)" << std::endl;

             std::cerr << " - FREQUENCY  : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;

             std::cerr << " - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;

         }

     }


     /**
      * Print a single pattern as one tab-separated row: the decoded pattern,
      * occurrence count, coverage count, coverage fraction, category name,
      * size, frequency, and a space-separated list of its references.
      *
      * A pattern for which getdata() returns NULL is printed with an empty
      * reference list instead of dereferencing a NULL pointer.
      * @param out The stream to write to
      * @param decoder Class decoder used to turn the pattern into a string
      * @param pattern The pattern to print
      * @param endline Whether to terminate the row with std::endl
      */
     void print(std::ostream* out, ClassDecoder &decoder, const PatternPointer & pattern, bool endline = true) {
         const std::string pattern_s = pattern.tostring(decoder);
         const unsigned int count = this->occurrencecount(pattern);
         const unsigned int covcount = this->coveragecount(pattern);
         const double coverage = covcount / (double) this->tokens();
         const double freq = this->frequency(pattern);
         const int cat = pattern.category();
         std::string cat_s; //stays empty for any other category value
         if (cat == 1) {
             cat_s = "ngram";
         } else if (cat == 2) {
             cat_s = "skipgram";
         } else if (cat == 3) {
             cat_s = "flexgram";
         }
         //NOTE(review): two tabs follow the count, so each row has one more
         //field than the header printed by print(out, decoder). Left as is
         //because existing consumers may rely on this layout -- confirm.
         *out << pattern_s << "\t" << count << "\t" << "\t" << covcount << "\t" << coverage << "\t" << cat_s << "\t" << pattern.size() << "\t" << freq << "\t";

         IndexedData * data = this->getdata(pattern);
         if (data != NULL) {
             unsigned int i = 0;
             for (IndexedData::iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {
                 i++;
                 *out << iter2->tostring();
                 if (i < count) *out << " "; //separator between references, none after the last
             }
         }
         if (endline) *out << std::endl;
     }


     /**
      * Compute skipgrams from the n-grams already in the model, for sizes 3
      * up to options.MAXLENGTH. After each size the model is pruned with
      * prune() and pruneskipgrams(). Stops at the first size for which
      * no skipgrams are found.
      * @param options Training options; a MINTOKENS of -1 is replaced by 2
      * @param constrainbymodel Optional model passed on to computeskipgrams()
      */
     virtual void trainskipgrams(PatternModelOptions options,  PatternModelInterface * constrainbymodel = NULL) {
         if (options.MINTOKENS == -1) options.MINTOKENS = 2;
         this->cache_grouptotal.clear(); //forces recomputation of statistics
         for (int n = 3; n <= options.MAXLENGTH; n++) {
             if (this->gapmasks[n].empty()) this->gapmasks[n] = compute_skip_configurations(n, options.MAXSKIPS);
             if (!options.QUIET) std::cerr << "Counting " << n << "-skipgrams" << std::endl;
             int foundskipgrams = 0;
             for (typename MapType::iterator iter = this->begin(); iter != this->end(); iter++) {
                 const PatternPointer pattern = PatternPointer(&(iter->first));
                 if (((int) pattern.n() != n) || (pattern.category() != NGRAM)) continue;
                 //Copy the index only for n-grams of the current size; the
                 //copy is what computeskipgrams() reads from.
                 const IndexedData multirefs = iter->second;
                 foundskipgrams += this->computeskipgrams(pattern,options, NULL, &multirefs, constrainbymodel, false);
             }
             if (!foundskipgrams) {
                 //honour QUIET like every other progress message here
                 if (!options.QUIET) std::cerr << " None found" << std::endl;
                 break;
             } else {
                 this->hasskipgrams = true;
             }
             if (!options.QUIET) std::cerr << " Found " << foundskipgrams << " skipgrams...";
             unsigned int pruned = this->prune(options.MINTOKENS,n);
             if (!options.QUIET) std::cerr << "pruned " << pruned;
             unsigned int prunedextra = this->pruneskipgrams(options.MINTOKENS_SKIPGRAMS, options.MINSKIPTYPES, n);
             if (prunedextra && !options.QUIET) std::cerr << " plus " << prunedextra << " extra skipgrams..";
             if (!options.QUIET) std::cerr << "...total kept: " <<  foundskipgrams - pruned - prunedextra << std::endl;
         }
     }


     /**
      * Fetch the pattern of length 1 at a position from the reverse index.
      * @param ref The position to look up
      * @return The result of reverseindex->getpattern(ref, 1)
      * @throws InternalError when no reverse index is loaded
      */
     Pattern getpatternfromtoken(IndexReference ref) {
         if (this->reverseindex != NULL) {
             return this->reverseindex->getpattern(ref,1);
         }
         std::cerr << "ERROR: getpatternfromtoken() No reverse index loaded" << std::endl;
         throw InternalError();
     }


     /**
      * Collect what fills the gaps of a skipgram at each of its occurrences.
      * For every reference of the skipgram, the corpus content at that
      * position is fetched from the reverse index, masked with the reversed
      * gap mask, trimmed of leading and trailing skips, and counted.
      * Flexgrams are not implemented and yield an empty map, as do n-grams.
      * @param pattern The skipgram to inspect
      * @return Map from skip content pattern to the number of occurrences
      * @throws InternalError when no reverse index is loaded
      * @throws NoSuchPattern when the skipgram is not present in the model
      */
     t_relationmap getskipcontent(const PatternPointer & pattern) {
         t_relationmap skipcontent; //will hold all skipcontent
         if (this->reverseindex == NULL) {
             std::cerr << "ERROR: No corpus data loaded! (in PatternModel::getskipcontent)" << std::endl;
             throw InternalError();
         }

         if (pattern.category() == SKIPGRAM) {
             const unsigned int n = pattern.n();
             const uint32_t skipcontent_mask = reversemask(pattern.mask,n );

             const int head = maskheadskip(skipcontent_mask, n); //skip at begin
             const int tail = masktailskip(skipcontent_mask,n); //skip at end

             const IndexedData * data = getdata(pattern);
             if (data == NULL) {
                 //same contract as the other relation getters
                 throw NoSuchPattern();
             }
             for (IndexedData::const_iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {
                 const IndexReference ref = *iter2;

                 //raw skipcontent with leading and trailing skips
                 PatternPointer skipcontent_atref_raw = this->reverseindex->getpattern(ref,n);
                 skipcontent_atref_raw.mask = skipcontent_mask;

                 //trim leading and trailing skips
                 Pattern skipcontent_atref = PatternPointer(skipcontent_atref_raw, head, n-head-tail); //pattern from patternpointer
                 skipcontent[skipcontent_atref] += 1;
             }
         } else if (pattern.category() == FLEXGRAM) {
             //TODO: implement
         }
         return skipcontent;
     }


     void prunerelations(t_relationmap & relations, unsigned int occurrencethreshold) {

         t_relationmap::iterator eraseiter;

         t_relationmap::iterator iter = relations.begin();

         while (iter != relations.end()) {

             if (iter->second < occurrencethreshold) {

                 eraseiter = iter;

                 iter++;

                 relations.erase(eraseiter);

             } else {

                 iter++;

             }

         }

     }


     /**
      * Find skipgrams of the same length that occur at the same positions
      * as the given pattern (abstractions of the pattern).
      *
      * The occurrence threshold is applied to each candidate (as in the
      * sibling relation getters), not to the query pattern itself.
      * @param pattern The pattern to find templates for
      * @param occurrencethreshold When non-zero, candidates occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @return Map from template pattern to the number of shared positions
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap gettemplates(const Pattern & pattern, unsigned int occurrencethreshold = 0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) {
             throw NoSuchPattern();
         }

         t_relationmap templates;

         const int _n = pattern.n();
         //search in forward index
         for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
             const IndexReference ref = *iter;

             //search in reverse index
             std::vector<PatternPointer> rindex = this->getreverseindex(ref);
             for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
                 const PatternPointer candidate = *iter2;

                 if (((int) candidate.n() == _n)  && (candidate != pattern) && (candidate.category() == SKIPGRAM)  && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold)) ) {
                     templates[candidate] += 1;
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(templates, occurrencethreshold);
         return templates;
     }


     /**
      * Find n-grams of the same length that occur at the same positions as
      * the given pattern (instantiations of the pattern).
      *
      * The occurrence threshold is applied to each candidate (as in the
      * sibling relation getters), not to the query pattern itself.
      * @param pattern The pattern to find instances for
      * @param occurrencethreshold When non-zero, candidates occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @return Map from instance pattern to the number of shared positions
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getinstances(const Pattern & pattern, unsigned int occurrencethreshold = 0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) {
             throw NoSuchPattern();
         }

         t_relationmap instances;

         const int _n = pattern.n();
         //search in forward index
         for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
             const IndexReference ref = *iter;

             //search in reverse index
             std::vector<PatternPointer> rindex = this->getreverseindex(ref);
             for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
                 const PatternPointer candidate = *iter2;

                 if (((int) candidate.n() == _n)  && (candidate != pattern) && (candidate.category() == NGRAM) && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold))  ) {
                     instances[candidate] += 1;
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(instances, occurrencethreshold);
         return instances;
     }


     /**
      * Find patterns that are contained in the given pattern: for every
      * occurrence of the pattern, each token position inside it is looked
      * up in the reverse index and patterns starting there that still fit
      * within the occurrence are counted.
      *
      * When skipgrams are involved, the candidate must be an instance of
      * the matching slice of the larger pattern. That slice is taken at the
      * offset relative to the start of the occurrence.
      * Flexgram candidates are not handled (TODO in the code).
      * @param pattern The pattern to find subchildren for
      * @param occurrencethreshold When non-zero, candidates occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only candidates with category() >= category are kept
      * @param size When non-zero, only candidates with n() >= size are kept
      * @return Map from contained pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getsubchildren(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) {
             throw NoSuchPattern();
         }

         t_relationmap subchildren;
         const int _n = pattern.n();
         const bool isskipgram = (pattern.category() == SKIPGRAM);
         //search in forward index
         for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
             const IndexReference ref = *iter;
             for (int i = ref.token; i < ref.token + _n; i++) {
                 const IndexReference begin = IndexReference(ref.sentence,i);
                 const int offset = i - ref.token; //position within the pattern
                 const int maxsubn = _n - offset;

                 //search in reverse index
                 std::vector<PatternPointer> rindex = this->getreverseindex(begin);
                 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
                     const PatternPointer candidate = *iter2;
                     if (((int) candidate.n() <= maxsubn) && (candidate != pattern)
                         && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold))
                         && ((category == 0) || (candidate.category() >= category))
                         && ((size == 0) || (candidate.n() >= size))
                         ) {
                         if ((isskipgram) || (candidate.category() == SKIPGRAM)) { //MAYBE TODO: I may check too much now... could be more efficient?
                             //candidate may not have skips in places where the larger pattern does
                             Pattern tmpl = Pattern(pattern, offset, candidate.n()); //get the proper slice to match
                             if (candidate.instanceof(tmpl)) {
                                 subchildren[candidate] = subchildren[candidate] + 1;
                             }
                         } else if (candidate.category() == FLEXGRAM) {
                             //TODO
                         } else {
                             subchildren[candidate]++;
                         }
                     }
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(subchildren, occurrencethreshold);
         return subchildren;
     }


     /**
      * Find patterns that subsume the given pattern (i.e. larger patterns):
      * for every occurrence, patterns in the same sentence that start at or
      * before the occurrence and are long enough to span it are counted.
      *
      * When skipgrams are involved, the pattern must be an instance of the
      * slice of the candidate that covers the occurrence; the slice starts
      * at the occurrence's offset within the candidate.
      * Flexgram candidates are not handled (TODO in the code).
      * @param pattern The pattern to find subparents for
      * @param occurrencethreshold When non-zero, candidates occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only candidates with category() >= category are kept
      * @param size When non-zero, only candidates with n() >= size are kept
      * @return Map from subsuming pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getsubparents(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) {
             throw NoSuchPattern();
         }

         t_relationmap subsumes;
         const int _n = pattern.n();
         //search in forward index
         for (IndexedData::iterator iter = data->begin(); iter != data->end(); iter++) {
             const IndexReference ref = *iter;

             //search in reverse index
             std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_bysentence(ref.sentence);
             for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
                 if ((iter2->first.sentence != ref.sentence) || (iter2->first.token > ref.token)) break;
                 const PatternPointer candidate = iter2->second;

                 //distance from the candidate's start to the occurrence (>= 0 after the break above)
                 const int offset = ref.token - iter2->first.token;
                 const int minsubsize = _n + offset;

                 if (((int) candidate.n() >= minsubsize)  && (candidate != pattern)
                         && ((occurrencethreshold == 0) || (this->occurrencecount(candidate) >= occurrencethreshold))
                         && ((category == 0) || (candidate.category() >= category))
                         && ((size == 0) || (candidate.n() >= size))
                     ) {
                     if ((candidate.category() == SKIPGRAM) || (pattern.category() == SKIPGRAM))  {//MAYBE TODO: I may check too much now... could be more efficient?
                         //instance may not have skips in places where the larger candidate pattern does
                         Pattern tmpl = Pattern(candidate, offset, pattern.n()); //get the proper slice to match
                         if (pattern.instanceof(tmpl)) {
                             subsumes[candidate] += 1;
                         }
                     } else if (candidate.category() == FLEXGRAM) {
                         //TODO
                     } else {
                         subsumes[candidate] += 1;
                     }
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(subsumes, occurrencethreshold);
         return subsumes;
     }


     /**
      * Find patterns that end exactly where an occurrence of the given
      * pattern begins (same sentence, start + length == occurrence start).
      * @param pattern The pattern to find left neighbours for
      * @param occurrencethreshold When non-zero, neighbours occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only neighbours with category() >= category are kept
      * @param size When non-zero, only neighbours with n() >= size are kept
      * @param cutoff When non-zero, searching stops once this many distinct neighbours are found
      * @return Map from neighbour pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getleftneighbours(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0, unsigned int cutoff=0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) throw NoSuchPattern();

         typedef std::vector<std::pair<IndexReference,PatternPointer>> t_sentenceindex;

         t_relationmap neighbours;
         for (IndexedData::iterator occurrence = data->begin(); occurrence != data->end(); occurrence++) {
             const IndexReference ref = *occurrence;

             t_sentenceindex rindex = this->getreverseindex_bysentence(ref.sentence);
             for (t_sentenceindex::iterator entry = rindex.begin(); entry != rindex.end(); entry++) {
                 const IndexReference ref2 = entry->first;
                 const PatternPointer neighbour = entry->second;
                 const bool endsatref = (ref2.token + neighbour.n() == ref.token);
                 if (endsatref
                         && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
                         && ((category == 0) || (neighbour.category() >= category))
                         && ((size == 0) || (neighbour.n() >= size))
                     ) {
                     neighbours[neighbour]++;
                     if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
                 } else if ((ref2.token > ref.token) || (ref2.sentence > ref.sentence)) {
                     //past the occurrence: nothing further can be a left neighbour
                     break;
                 }
             }
             if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
         }
         if (occurrencethreshold > 0) this->prunerelations(neighbours, occurrencethreshold);
         return neighbours;
     }


     /**
      * Find patterns that begin at the position directly after an
      * occurrence of the given pattern (occurrence start + pattern.size()).
      * @param pattern The pattern to find right neighbours for
      * @param occurrencethreshold When non-zero, neighbours occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only neighbours with category() >= category are kept
      * @param size When non-zero, only neighbours with n() >= size are kept
      * @param cutoff When non-zero, searching stops once this many distinct neighbours are found
      * @return Map from neighbour pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getrightneighbours(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0, unsigned int cutoff=0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) throw NoSuchPattern();

         t_relationmap neighbours;
         for (IndexedData::iterator occurrence = data->begin(); occurrence != data->end(); occurrence++) {
             //position immediately following this occurrence
             IndexReference ref = *occurrence;
             ref.token += pattern.size();

             //search in reverse index
             std::vector<PatternPointer> rindex = this->getreverseindex(ref);
             for (std::vector<PatternPointer>::iterator entry = rindex.begin(); entry != rindex.end(); entry++) {
                 const PatternPointer neighbour = *entry;
                 if ( ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
                         && ((category == 0) || (neighbour.category() >= category))
                         && ((size == 0) || (neighbour.n() >= size)) ) {
                     neighbours[neighbour]++;
                     if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
                 }
             }
             if ((cutoff > 0) && (neighbours.size() >= cutoff)) break;
         }
         if (occurrencethreshold > 0) this->prunerelations(neighbours, occurrencethreshold);
         return neighbours;
     }


     /**
      * Remove skipgrams whose gaps are filled by too few distinct contents.
      * For each skipgram (optionally only those of size _n), the skip
      * content is computed with getskipcontent() and the skipgram is erased
      * when the number of distinct skip contents is below minskiptypes.
      * @param threshold Not used by this function body
      * @param minskiptypes Minimum number of distinct skip contents; values
      *        of 1 or lower mean nothing is pruned
      * @param _n When non-zero, only skipgrams of this size are considered
      * @return The number of skipgrams erased
      */
     int pruneskipgrams(int threshold, int minskiptypes, int _n = 0) {
         int pruned = 0;
         if (minskiptypes <=1) return pruned; //nothing to do

         typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin();
         while(iter != this->end()) {
             const PatternType pattern = iter->first;
             if (( (_n == 0) || ((int) pattern.n() == _n) ) && (pattern.category() == SKIPGRAM)) {
                 //computed once per skipgram; this is the expensive step
                 t_relationmap skipcontent = getskipcontent(pattern);
                 if ((int) skipcontent.size() < minskiptypes) { //will take care of token threshold too, patterns not meeting the token threshold are not included
                     iter = this->erase(iter); //erase returns the next valid iterator
                     pruned++;
                     continue;
                 }
             }
             iter++;
         }
         return pruned;
     }


     virtual void computecoveragestats(int category = 0, int n = 0) {

         //opting for memory over speed (more iterations, less memory)

         //overloaded version for indexedmodel

         if ((this->cache_grouptotal.empty()) && (!this->data.empty())) this->computestats();


         if ((this->cache_n.size() == 1) && (*this->cache_n.begin() == 1) && (n <= 1)) {

             //special condition, only unigrams, we can be done quicker

             this->cache_grouptotalwordtypes[0][1] = this->size();

             typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin();

             while (iter != this->end()) {

                 this->cache_grouptotaltokens[0][1] += this->valuehandler.count(iter->second);

                 iter++;

             }

             return;

         }


         for (std::set<int>::iterator iterc = this->cache_categories.begin(); iterc != this->cache_categories.end(); iterc++) {

           if ((category == 0) || (*iterc == category)) {

             for (std::set<int>::iterator itern = this->cache_n.begin(); itern != this->cache_n.end(); itern++) {

               if (((n == 0) || (*itern == n)) && (this->cache_grouptotalwordtypes[*iterc][*itern] == 0) )  {

                 std::unordered_set<Pattern> types;

                 std::set<IndexReference> tokens;

                 typename PatternModel<IndexedData,IndexedDataHandler,MapType>::iterator iter = this->begin();

                 while (iter != this->end()) {

                     const Pattern pattern = iter->first;

                     const int n = pattern.n();

                     if ( (n == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {

                         types.insert(pattern);

                     } else {

                         if (((*itern == 0) || (n == *itern))  && ((*iterc == 0) || (pattern.category() == *iterc))) {

                             std::vector<Pattern> unigrams;

                             pattern.ngrams(unigrams, 1);

                             for (std::vector<Pattern>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {

                                 const Pattern p = *iter2;

                                 types.insert(p);

                             }

                         }

                     }

                     IndexedData * data = this->getdata(pattern);

                     for (IndexedData::iterator dataiter = data->begin(); dataiter != data->end(); dataiter++) {

                         //take into account all tokens

                         for (unsigned int i = 0; i < pattern.n(); i++) {

                             tokens.insert(*dataiter + i);

                         }

                     }

                     iter++;

                 }

                 this->cache_grouptotalwordtypes[*iterc][*itern] += types.size();

                 this->cache_grouptotaltokens[*iterc][*itern] += tokens.size();

             }

           }

         }

       }

     }


     /**
      * Find patterns that co-occur to the right of the given pattern without
      * overlap: entries returned by getreverseindex_right() whose start
      * token is greater than the occurrence start plus the pattern length.
      * @param pattern The pattern to find co-occurrences for
      * @param occurrencethreshold When non-zero, neighbours occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only neighbours with category() >= category are kept
      * @param size When non-zero, only neighbours with n() >= size are kept
      * @param matches When non-NULL, the position of every counted neighbour is inserted here
      * @return Map from co-occurring pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getrightcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0,IndexedData * matches = NULL) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) throw NoSuchPattern();

         typedef std::vector<std::pair<IndexReference,PatternPointer>> t_positionindex;

         const int _n = pattern.n();
         t_relationmap cooc;
         //find everything that co-occurs *without overlap* TO THE RIGHT
         for (IndexedData::iterator occurrence = data->begin(); occurrence != data->end(); occurrence++) {
             const IndexReference ref = *occurrence;

             t_positionindex rindex = this->getreverseindex_right(ref);
             for (t_positionindex::iterator entry = rindex.begin(); entry != rindex.end(); entry++) {
                 const IndexReference ref2 = entry->first;
                 const PatternPointer neighbour = entry->second;
                 const bool beyondpattern = (ref2.token > ref.token + _n);
                 if ( beyondpattern
                         && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
                         && ((category == 0) || (neighbour.category() >= category))
                         && ((size == 0) || (neighbour.n() >= size))
                      ) {
                     cooc[neighbour]++;
                     if (matches != NULL) matches->insert(ref2);
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(cooc, occurrencethreshold);
         return cooc;
     }


     /**
      * Find patterns that co-occur to the left of the given pattern without
      * overlap: entries returned by getreverseindex_left() whose start token
      * plus length is smaller than the occurrence start.
      * @param pattern The pattern to find co-occurrences for
      * @param occurrencethreshold When non-zero, neighbours occurring less
      *        often are skipped and the result is pruned with prunerelations()
      * @param category When non-zero, only neighbours with category() >= category are kept
      * @param size When non-zero, only neighbours with n() >= size are kept
      * @return Map from co-occurring pattern to count
      * @throws InternalError when the reverse index is missing or empty
      * @throws NoSuchPattern when the pattern is not present in the model
      */
     t_relationmap getleftcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * data = this->getdata(pattern);
         if (data == NULL) throw NoSuchPattern();

         typedef std::vector<std::pair<IndexReference,PatternPointer>> t_positionindex;

         t_relationmap cooc;
         //find everything that co-occurs *without overlap* TO THE LEFT
         for (IndexedData::iterator occurrence = data->begin(); occurrence != data->end(); occurrence++) {
             const IndexReference ref = *occurrence;

             t_positionindex rindex = this->getreverseindex_left(ref);
             for (t_positionindex::iterator entry = rindex.begin(); entry != rindex.end(); entry++) {
                 const IndexReference ref2 = entry->first;
                 const PatternPointer neighbour = entry->second;
                 const int neighbourlength = neighbour.n();
                 const bool endsbeforeref = (ref2.token + neighbourlength < ref.token);
                 if ( endsbeforeref
                         && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
                         && ((category == 0) || (neighbour.category() >= category))
                         && ((size == 0) || (neighbour.n() >= size))
                     ) {
                     cooc[neighbour]++;
                 }
             }
         }
         if (occurrencethreshold > 0) this->prunerelations(cooc, occurrencethreshold);
         return cooc;
     }


     /**
      * Count the patterns that co-occur with the given pattern in the same
      * sentence, on either side. For every occurrence of the pattern, the
      * entries returned by getreverseindex_bysentence() for that sentence are
      * inspected; an entry is counted when it lies entirely before the
      * occurrence (its token + its length < occurrence token) or begins
      * beyond it (its token > occurrence token + pattern length).
      *
      * @param pattern The pattern to collect co-occurrences for
      * @param occurrencethreshold If non-zero, candidates with a lower occurrencecount() are skipped and prunerelations() is applied to the result with this value
      * @param category If non-zero, candidates with a lower category() are skipped
      * @param size If non-zero, candidates with a lower n() are skipped
      * @param ordersignificant If true, candidates for which neighbour.pattern() < pattern holds are skipped
      * @return Map from co-occurring pattern to the number of times it was counted
      * @throws InternalError if no (non-empty) reverse index is attached
      * @throws NoSuchPattern if the model holds no data for the pattern
      */
     t_relationmap getcooc(const PatternPointer & pattern, unsigned int occurrencethreshold = 0, int category = 0, unsigned int size = 0,bool ordersignificant = false) {
         if ((this->reverseindex == NULL) || (this->reverseindex->empty())) {
             std::cerr << "ERROR: No reverse index present" << std::endl;
             throw InternalError();
         }

         IndexedData * occurrences = this->getdata(pattern);
         if (occurrences == NULL) throw NoSuchPattern();

         const int patternlength = pattern.n();
         t_relationmap result;

         for (IndexedData::iterator occ = occurrences->begin(); occ != occurrences->end(); ++occ) {
             const IndexReference here = *occ;
             std::vector<std::pair<IndexReference,PatternPointer>> candidates = this->getreverseindex_bysentence(here.sentence);

             for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator cand = candidates.begin(); cand != candidates.end(); ++cand) {
                 const IndexReference there = cand->first;
                 const PatternPointer other = cand->second;

                 if ((ordersignificant) && (other.pattern() < pattern)) continue;

                 const int otherlength = other.n();
                 const bool entirelyleft = (there.token + otherlength < here.token);
                 const bool entirelyright = (there.token > here.token + patternlength);
                 if (!(entirelyleft || entirelyright)) continue;

                 //optional filters; a value of zero disables the filter
                 if ((occurrencethreshold != 0) && !(this->occurrencecount(other) >= occurrencethreshold)) continue;
                 if ((category != 0) && !(other.category() >= category)) continue;
                 if ((size != 0) && !(other.n() >= size)) continue;

                 result[other]++;
             }
         }

         if (occurrencethreshold > 0) this->prunerelations(result, occurrencethreshold);
         return result;
     }


     /**
      * Normalised pointwise mutual information score for a pair of patterns.
      * Computes log(jointcount / (count1 * count2)) / -log(jointcount / total),
      * where count1 and count2 are the occurrence counts of the two patterns
      * and total is totaloccurrencesingroup(0,0).
      *
      * The two occurrence counts are multiplied as doubles: multiplying them
      * as integers first could wrap around for frequent patterns and silently
      * corrupt the score.
      *
      * NOTE(review): the numerator uses raw counts rather than probabilities;
      * this is kept exactly as in the original formula.
      *
      * @param key1 First pattern
      * @param key2 Second pattern
      * @param jointcount Number of times the two patterns co-occur
      * @return The score; not finite when jointcount is zero
      */
     double npmi(const PatternPointer & key1, const PatternPointer & key2, int jointcount) {
         //normalised pointwise mutual information
         const double joint = static_cast<double>(jointcount);
         const double countproduct = static_cast<double>(this->occurrencecount(key1)) * static_cast<double>(this->occurrencecount(key2));
         const double total = static_cast<double>(this->totaloccurrencesingroup(0,0));
         return  log( joint / countproduct )  / -log( joint / total );
     }


     void outputrelations(const PatternPointer & pattern, t_relationmap & relations, ClassDecoder & classdecoder, std::ostream *OUT, const std::string label = "RELATED-TO") {

         int total = 0;

         for (t_relationmap::iterator iter = relations.begin(); iter != relations.end(); iter++) {

             total += iter->second;

         }

         if (total == 0) return;

         double total_f = total;

         const std::string pattern_s = pattern.tostring(classdecoder);

         for (t_relationmap::iterator iter = relations.begin(); iter != relations.end(); iter++) {

             const PatternPointer pattern2 = iter->first;

             *OUT << "\t" << pattern_s << "\t" << label << "\t" << pattern2.tostring(classdecoder) << "\t" << iter->second << "\t" << iter->second / total_f << "\t" << this->occurrencecount(pattern2) << std::endl;

         }

     }


     void outputrelations(const PatternPointer & pattern, ClassDecoder & classdecoder, std::ostream * OUT, bool outputheader=true) {

         if (outputheader) *OUT << "#\tPATTERN1\tRELATION\tPATTERN2\tREL.COUNT\tREL.FREQUENCY\tCOUNT2" << std::endl;

         {

             t_relationmap relations = this->getsubparents(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "SUBSUMED-BY");

         }

         {

             t_relationmap relations = this->getsubchildren(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "SUBSUMES");

         }

         {

             t_relationmap relations = this->getleftneighbours(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "RIGHT-NEIGHBOUR-OF");

         }

         {

             t_relationmap relations = this->getrightneighbours(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "LEFT-NEIGHBOUR-OF");

         }

         {

             t_relationmap relations = this->getrightcooc(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "LEFT-COOC-OF");

         }

         {

             t_relationmap relations = this->getleftcooc(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "RIGHT-COOC-OF");

         }

         if (pattern.category() == SKIPGRAM) {

             t_relationmap relations = this->getskipcontent(pattern);

             this->outputrelations(pattern, relations, classdecoder, OUT, "INSTANTIATED-BY");

         }

     }


     /*

      * Compute co-occurence as normalised pointwise mutual information for all patterns

      * @param coocmap The map that will store the results

      * @param threshold Only include pairs passing this NPMI threshold

      * @param right Compute co-occurence to the right  (default: true)

      * @param left Compute co-occurence to the left  (default: true)

      */

     void computenpmi( std::map<PatternPointer,t_relationmap_double> &  coocmap , double threshold, bool right=true, bool left=true) {

         //compute npmi co-occurrence for all patterns


         for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {

             const PatternType pattern = iter->first;

             t_relationmap tmp;

             if ((right)&&(!left)) {

                 tmp =  this->getrightcooc(pattern);

             } else if ((left)&&(!right)) {

                 tmp =  this->getleftcooc(pattern);

             } else if (left && right) { //order not relevant

                 tmp =  this->getcooc(pattern);

             }

             for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {

                 const PatternPointer pattern2 = iter2->first;

                 const double value = npmi(pattern,pattern2,iter2->second);

                 if (value >= threshold) coocmap[pattern][pattern2] = value;

             }

         }

     }


     void computecooc( std::map<PatternPointer,t_relationmap> &  coocmap , int threshold, bool right=true, bool left=true) {

         for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {

             const PatternType  pattern = iter->first;

             t_relationmap tmp;

             if ((right)&&(!left)) {

                 tmp =  this->getrightcooc(pattern, threshold);

             } else if ((left)&&(!right)) {

                 tmp =  this->getleftcooc(pattern, threshold);

             } else if (left && right) { //order not relevant

                 tmp =  this->getcooc(pattern, threshold);

             }

             for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {

                 const PatternPointer pattern2 = iter2->first;

                 const double value = iter2->second;

                 if (value >= threshold) coocmap[pattern][pattern2] = value;

             }

         }

     }


     int computeflexgrams_fromskipgrams() {

         this->cache_grouptotal.clear(); //forces recomputation of statistics

         int count = 0;

         for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {

             const PatternType pattern = iter->first;

             if (pattern.category() == SKIPGRAM) {

                 const PatternType flexgram = pattern.toflexgram();

                 if (!this->has(flexgram)) count++;

                 //copy data from pattern

                 IndexedData * data = this->getdata(pattern);

                 for (IndexedData::iterator iter2 = data->begin(); iter2 != data->end(); iter2++) {

                     const IndexReference ref = *iter2;

                     this->data[flexgram].insert(ref);

                 }

             }

         }

         return count;

     }


     // Build flexgrams from right-hand co-occurrence: for every pattern in the
     // model and every pattern co-occurring to its right with an NPMI score of
     // at least the threshold, the combination "pattern + gap + co-occurring
     // pattern" is stored in the model.
     // @param threshold Minimum npmi() value for a pair to be turned into a flexgram
     // @return The number of combinations that were not yet in the model
     // NOTE(review): new keys are written to this->data while the model is
     // being iterated over; confirm this is safe for the map type in use.
     int computeflexgrams_fromcooc(double threshold) { //TODO: won't work in pattern pointer model

         this->cache_grouptotal.clear(); //forces recomputation of statistics

         int found = 0;

         // presumably 129 is the encoded class of a variable-width gap -- TODO confirm against the pattern encoding
         const unsigned char dynamicgap = 129;

         const Pattern dynamicpattern = Pattern(&dynamicgap,1);

         for (typename PatternModel<IndexedData,IndexedDataHandler,MapType,PatternType>::iterator iter = this->begin(); iter != this->end(); iter++) {

             const PatternType pattern = iter->first;

             // filled by getrightcooc() with the positions of the counted candidates; not read afterwards in this function
             IndexedData matches;

             t_relationmap tmp =  this->getrightcooc(pattern, 0,0,0, &matches);

             for (t_relationmap::iterator iter2 = tmp.begin(); iter2 != tmp.end(); iter2++) {

                 const PatternPointer pattern2 = iter2->first;

                 const double value = npmi(pattern,pattern2,iter2->second);

                 if (value >= threshold) {

                     const Pattern flexgram = pattern + dynamicpattern + pattern2;

                     if (!this->has(flexgram)) found++;

                     // NOTE(review): assigns the NPMI score (a double) to the model's value for this flexgram, whereas the other methods here treat these values as IndexedData -- confirm the intent
                     this->data[flexgram] = value;

                 }

             }

         }

         return found;

     }


     /**
      * Compute NPMI co-occurrence for all patterns and write the pairs to the
      * output stream, sorted by descending NPMI, as tab-separated lines
      * (Pattern1, Pattern2, NPMI) preceded by a header line.
      *
      * The inverse map holds owning Pattern copies. The per-pattern relation
      * maps are erased while the inverse map is being built (to release
      * memory early), so the inverse map must not keep PatternPointers that
      * refer to keys of those maps or to temporaries.
      *
      * @param OUT Output stream to write to
      * @param classdecoder Decoder used to render the patterns as text
      * @param threshold Only pairs with an NPMI of at least this value are included
      */
     void outputcooc_npmi(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {
         std::map<PatternPointer,t_relationmap_double> npmimap;
         std::cerr << "Collecting patterns and computing NPMI..." << std::endl;
         computenpmi(npmimap, threshold);

         std::cerr << "Building inverse map..." << std::endl;
         //we want the reverse, so we can sort by co-occurrence
         std::multimap<double,std::pair<Pattern,Pattern>> inversemap;
         std::map<PatternPointer,t_relationmap_double>::iterator iter = npmimap.begin();
         while (iter != npmimap.end()) {
             for (t_relationmap_double::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++) {
                 inversemap.insert(std::pair<double,std::pair<Pattern,Pattern>>(iter2->second, std::pair<Pattern,Pattern>(iter->first, iter2->first)));
             }
             //release the relation map of this pattern right away
             iter = npmimap.erase(iter);
         }

         *OUT << "Pattern1\tPattern2\tNPMI" << std::endl;
         for (std::multimap<double,std::pair<Pattern,Pattern>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
             const Pattern & pattern1 = iter2->second.first;
             const Pattern & pattern2 = iter2->second.second;
             *OUT << pattern1.tostring(classdecoder) << "\t" << pattern2.tostring(classdecoder) << "\t" << iter2->first << std::endl;
         }
     }


     void outputcooc(std::ostream * OUT, ClassDecoder& classdecoder, double threshold) {

         std::map<PatternPointer,t_relationmap> coocmap;

         std::cerr << "Collecting patterns and computing co-occurrence..." << std::endl;

         computecooc(coocmap, threshold);


         std::cerr << "Building inverse map..." << std::endl;

         //we want the reverse, so we can sort by co-occurrence

         std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>> inversemap;

         std::map<PatternPointer,t_relationmap>::iterator iter = coocmap.begin();

         while (iter != coocmap.end()) {

             for (t_relationmap::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++) {

                 inversemap.insert(std::pair<uint32_t,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<PatternPointer,PatternPointer>(iter->first, iter2->first)));

             }

             iter = coocmap.erase(iter);

         }


         *OUT << "Pattern1\tPattern2\tCooc" << std::endl;

         for (std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {

             const Pattern pattern1 = iter2->second.first;

             const Pattern pattern2 = iter2->second.second;

             *OUT << pattern1.tostring(classdecoder) << "\t" << pattern2.tostring(classdecoder) << "\t" << iter2->first << std::endl;

         }

     }


     // Compute the size in tokens of a pattern at the given position.
     // For anything but a flexgram this is simply pattern.n(). For a flexgram
     // the parts of the pattern are walked against the reverse-index entries
     // to the right of the begin position, and the distance between the
     // original begin token and the last computed "next begin" is returned.
     // @param pattern The pattern to measure
     // @param begin The position at which the pattern starts (taken by value and reused as a cursor)
     // @return The size in tokens, or 0 when no entry is available for one of the parts
     int flexgramsize(const Pattern & pattern, IndexReference begin) {


         if (pattern.category() != FLEXGRAM) return pattern.n();


         std::vector<Pattern> parts;

         int numberofparts = pattern.parts(parts);

         // NOTE(review): strictbegin is never read in this function
         bool strictbegin = true;

         // entries of the reverse index, keyed by their sequence number
         std::multimap<int, IndexReference> partmatches;

         int i = 0;

         std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->getreverseindex_right(begin); //TODO: Check

         for (std::vector<std::pair<IndexReference,PatternPointer>>::iterator iter = rindex.begin(); iter != rindex.end(); iter++) {

             // NOTE(review): part is never read; the entry is not compared against the parts of the flexgram
             const PatternPointer part = iter->second;

             IndexReference ref = iter->first;

             partmatches.insert(std::pair<int,IndexReference>(i, ref));

             // the key is incremented for every entry, so each key holds exactly one position
             i++;

         }


         int firsttoken = begin.token;

         // token 999 acts as a placeholder that any real candidate compares lower than
         IndexReference nextbegin = IndexReference(begin.sentence,999);

         for (int j = 0; j < numberofparts; j++) {

            //find a path

            int prevlevel = -1;

            bool found = false;

            // visits the entries stored under key j (at most one, given how the keys are assigned above)
            for (std::multimap<int, IndexReference>::iterator iter = partmatches.lower_bound(j); iter != partmatches.upper_bound(j); iter++) {

                 found = true;

                 // on entering a new level, the cursor advances to the position computed for the previous part
                 if (iter->first != prevlevel) {

                     begin = nextbegin;

                     nextbegin = IndexReference(begin.sentence,999); //reset

                 }

                 // keep the earliest end position (plus one token) among entries at or after the cursor
                 if (((iter->second == begin) || (begin < iter->second)) && (iter->second + parts[j].n() + 1 < nextbegin)) {

                     nextbegin = iter->second + parts[j].n() + 1;

                 }

                 prevlevel = iter->first;

            }

            if (!found) return 0;

         }

         return (nextbegin.token - firsttoken);

     }


 };


 /**
  * Pattern model that is keyed by PatternPointer and reports model type
  * UNINDEXEDPATTERNPOINTERMODEL. The model can be bound to an IndexedCorpus,
  * which is stored as reverse index; add() checks that every pattern pointer
  * lies within the memory range of that corpus.
  */
 template<class ValueType, class ValueHandler = BaseValueHandler<ValueType>, class MapType = PatternPointerMap<ValueType, BaseValueHandler<ValueType>>>
 class PatternPointerModel: public PatternModel<ValueType,ValueHandler,MapType,PatternPointer> {
     protected:
         /**
          * Shared constructor logic: set model type and version and bind the
          * corpus (or explicitly clear the reverse index when there is none).
          */
         void initpointermodel(IndexedCorpus * corpus) {
             this->model_type = this->getmodeltype();
             this->model_version = this->getmodelversion();
             if (corpus) {
                 this->reverseindex = corpus;
                 this->attachcorpus(*corpus);
             } else {
                 this->reverseindex = NULL;
             }
         }

     public:

         // The constructors are declared with the plain class name: repeating
         // the template arguments in a constructor declaration is not
         // accepted any more as of C++20.

         /**
          * Begin a new, empty model.
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          */
         PatternPointerModel(IndexedCorpus * corpus): PatternModel<ValueType,ValueHandler,MapType,PatternPointer>() {
             this->initpointermodel(corpus);
         }

         /**
          * Load a model from an input stream.
          * @param f The stream to read from
          * @param options Options passed on to load()
          * @param constrainmodel Optional model passed on to load()
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          */
         PatternPointerModel(std::istream *f, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL):  PatternModel<ValueType,ValueHandler,MapType,PatternPointer>(){ //load from file
             this->initpointermodel(corpus);
             this->load(f,options, constrainmodel);
         }

         /**
          * Load a model from a file.
          * @param filename The file to read from
          * @param options Options passed on to load()
          * @param constrainmodel Optional model passed on to load()
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          * @throws InternalError if the file can not be opened
          */
         PatternPointerModel(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): PatternModel<ValueType,ValueHandler,MapType,PatternPointer>() { //load from file
             this->initpointermodel(corpus);
             //the stream lives on the stack so it is closed and released even if load() throws
             std::ifstream in(filename.c_str());
             if (!in.good()) {
                 std::cerr << "ERROR: Unable to open file " << filename << std::endl;
                 throw InternalError();
             }
             this->load( static_cast<std::istream *>(&in), options, constrainmodel);
         }

         int getmodeltype() const { return UNINDEXEDPATTERNPOINTERMODEL; }
         int getmodelversion() const { return 2;}

         /**
          * Add an occurrence of a pattern to the model, creating the pattern's
          * value if it does not exist yet.
          * @param patternpointer The pattern; must point into the bound corpus
          * @param ref The position of the occurrence
          * @throws InternalError if no corpus is bound or the pointer lies outside of it
          */
         virtual void add(const PatternPointer & patternpointer, const IndexReference & ref) {
             //the constructors allow a NULL corpus; the range check below can not be done without one
             if (this->reverseindex == NULL) {
                 std::cerr << "ERROR: No corpus data attached, unable to validate pattern pointer" << std::endl;
                 throw InternalError();
             }
             if ((patternpointer.data < this->reverseindex->beginpointer()) || (patternpointer.data > this->reverseindex->beginpointer() + this->reverseindex->bytesize())) {
                 std::cerr << "Pattern Pointer points outside contained corpus data..." << std::endl;
                 throw InternalError();
             }
             ValueType * data = this->getdata(patternpointer, true);
             this->add(patternpointer, data, ref );
         }

         /**
          * Add an occurrence to an already resolved value.
          * @param pattern The pattern (not used by this implementation)
          * @param value The value to update; must not be NULL
          * @param ref The position of the occurrence
          * @throws InternalError if value is NULL
          */
         virtual void add(const PatternPointer & pattern, ValueType * value, const IndexReference & ref) {
             if (value == NULL) {
                 std::cerr << "Add() value is NULL!" << std::endl;
                 throw InternalError();
             }
             this->valuehandler.add(value, ref);
         }

 };


 /**
  * Indexed pattern model that is keyed by PatternPointer and reports model
  * type INDEXEDPATTERNPOINTERMODEL. The model can be bound to an
  * IndexedCorpus, which is stored as reverse index; add() checks that every
  * pattern pointer lies within the memory range of that corpus.
  */
 template<class MapType=PatternPointerMap<IndexedData, IndexedDataHandler>>
 class IndexedPatternPointerModel: public IndexedPatternModel<MapType,PatternPointer> {
     protected:
         /**
          * Shared constructor logic: set model type and version and bind the
          * corpus (or explicitly clear the reverse index when there is none).
          */
         void initpointermodel(IndexedCorpus * corpus) {
             this->model_type = this->getmodeltype();
             this->model_version = this->getmodelversion();
             if (corpus) {
                 this->reverseindex = corpus;
                 this->attachcorpus(*corpus);
             } else {
                 this->reverseindex = NULL;
             }
         }

     public:

         // The constructors are declared with the plain class name: repeating
         // the template arguments in a constructor declaration is not
         // accepted any more as of C++20.

         /**
          * Begin a new, empty model.
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          */
         IndexedPatternPointerModel(IndexedCorpus * corpus): IndexedPatternModel<MapType,PatternPointer>() {
             this->initpointermodel(corpus);
         }

         /**
          * Load a model from an input stream.
          * @param f The stream to read from
          * @param options Options passed on to load()
          * @param constrainmodel Optional model passed on to load()
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          */
         IndexedPatternPointerModel(std::istream *f, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): IndexedPatternModel<MapType,PatternPointer>(){ //load from file
             this->initpointermodel(corpus);
             this->load(f,options, constrainmodel);
         }

         /**
          * Load a model from a file.
          * @param filename The file to read from
          * @param options Options passed on to load()
          * @param constrainmodel Optional model passed on to load()
          * @param corpus The corpus the pattern pointers refer to (may be NULL)
          * @throws InternalError if the file can not be opened
          */
         IndexedPatternPointerModel(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL, IndexedCorpus * corpus = NULL): IndexedPatternModel<MapType,PatternPointer>() { //load from file
             this->initpointermodel(corpus);
             //the stream lives on the stack so it is closed and released even if load() throws
             std::ifstream in(filename.c_str());
             if (!in.good()) {
                 std::cerr << "ERROR: Unable to open file " << filename << std::endl;
                 throw InternalError();
             }
             this->load( static_cast<std::istream *>(&in), options, constrainmodel);
         }

         int getmodeltype() const { return INDEXEDPATTERNPOINTERMODEL; }
         int getmodelversion() const { return 2;}

         /**
          * Add an occurrence of a pattern to the model, creating the pattern's
          * index data if it does not exist yet.
          * @param patternpointer The pattern; must point into the bound corpus
          * @param ref The position of the occurrence
          * @throws InternalError if no corpus is bound or the pointer lies outside of it
          */
         void add(const PatternPointer & patternpointer, const IndexReference & ref) {
             //the constructors allow a NULL corpus; the range check below can not be done without one
             if (this->reverseindex == NULL) {
                 std::cerr << "ERROR: No corpus data attached, unable to validate pattern pointer" << std::endl;
                 throw InternalError();
             }
             if ((patternpointer.data < this->reverseindex->beginpointer()) || (patternpointer.data > this->reverseindex->beginpointer() + this->reverseindex->bytesize())) {
                 std::cerr << "Pattern Pointer points outside contained corpus data..." << std::endl;
                 throw InternalError();
             }
             IndexedData * data = this->getdata(patternpointer, true);
             this->add(patternpointer, data, ref );
         }

         /**
          * Add an occurrence to already resolved index data.
          * @param patternpointer The pattern
          * @param value The index data to update; looked up (and created if needed) when NULL
          * @param ref The position of the occurrence
          */
         void add(const PatternPointer & patternpointer, IndexedData * value, const IndexReference & ref) {
             if (value == NULL) {
                 value = this->getdata(patternpointer,true);
             }
             this->valuehandler.add(value, ref);
         }
 };


 // Declaration only; the implementation is not part of this header.
 // Judging by the name, this yields a log-likelihood score for one pattern
 // across the given unindexed models -- confirm against the implementation.
 double comparemodels_loglikelihood(const Pattern pattern, std::vector<PatternModel<uint32_t>* > & models);

 // Declaration only; the implementation is not part of this header.
 // Presumably the multi-pattern variant that stores one score per pattern in
 // resultmap; output and classdecoder are optional (default NULL) and
 // conjunctiononly defaults to false -- semantics to be confirmed against the
 // implementation.
 void comparemodels_loglikelihood(std::vector<PatternModel<uint32_t>* > & models, PatternMap<double> * resultmap, bool conjunctiononly = false, std::ostream * output = NULL, ClassDecoder * classdecoder = NULL );


 #endif

IndexedPatternModel::outputcooc
void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3271

PatternPointer::out
bool out() const
Definition: pattern.cpp:345

PatternModelInterface::minlength
virtual int minlength() const  =0

PatternSetModel::write
void write(std::ostream *out)
Definition: patternmodel.h:438

IndexedPatternModel::print
void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:2398

PatternModel::version
unsigned char version() const
Definition: patternmodel.h:1379

PatternModel::minn
int minn
Definition: patternmodel.h:534

PatternModel::report
void report(std::ostream *OUT)
Definition: patternmodel.h:2056

PatternModel::gettemplates
virtual t_relationmap gettemplates(const Pattern &pattern, int=0)
Definition: patternmodel.h:2171

PatternModel::getskipcontent
virtual t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2173

maskheadskip
int maskheadskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:68

PatternModel::getnpmi
virtual t_relationmap_double getnpmi(const Pattern &pattern, double threshold)
Definition: patternmodel.h:2176

IndexedPatternModel::posttrain
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:2203

PatternModel::printreverseindex
virtual void printreverseindex(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1883

PatternSetModel::load
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:377

PatternSetModel::minlength
virtual int minlength() const
Definition: patternmodel.h:492

PatternModel::type
unsigned char type() const
Definition: patternmodel.h:1378

PATTERNPOINTER
Definition: pattern.h:61

NONE
Definition: patternmodel.h:86

PatternModel::getinstances
virtual t_relationmap getinstances(const Pattern &pattern, int=0)
Definition: patternmodel.h:2172

PatternModelOptions::MINSKIPTYPES
int MINSKIPTYPES
Minimum required amount of distinct patterns that can fit in a gap of a skipgram for the skipgram to ...
Definition: patternmodel.h:136

PatternSet::size
size_t size() const
Definition: patternstore.h:597

PatternPointer::size
const size_t size() const
Definition: pattern.h:436

PatternPointer::ngrams
int ngrams(std::vector< PatternPointer > &container, const int n) const
Definition: pattern.cpp:1072

PatternModelOptions::MAXLENGTH
int MAXLENGTH
The maximum length of patterns to be loaded/extracted, inclusive (in words/tokens) (default: 100) ...
Definition: patternmodel.h:126

IndexedPatternModel::getdata
IndexedData * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:2299

IndexedPatternModel::getpatternfromtoken
Pattern getpatternfromtoken(IndexReference ref)
Definition: patternmodel.h:2484

PatternModel::totaltokensingroup
unsigned int totaltokensingroup(int category, int n)
Definition: patternmodel.h:1656

PatternModel::printpattern
void printpattern(std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true)
Definition: patternmodel.h:1935

PatternMap::erase
bool erase(const Pattern &pattern)
Definition: patternstore.h:821

PatternModel::reverseindex
IndexedCorpus * reverseindex
Pointer to the reverse index and corpus data for this model (or NULL)
Definition: patternmodel.h:563

PatternModel::getreverseindex
std::vector< PatternPointer > getreverseindex(const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1408

PatternFeatureVectorMapHandler
Definition: datatypes.h:477

IndexedPatternModel::getdata
IndexedData * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:2310

PatternModel::totalpatternsingroup
unsigned int totalpatternsingroup(int category, int n)
Definition: patternmodel.h:1634

PatternModel::maxlength
virtual int maxlength() const
Definition: patternmodel.h:1312

PatternModelOptions::DOREVERSEINDEX
bool DOREVERSEINDEX
Obsolete now, only here for backward-compatibility with v1.
Definition: patternmodel.h:139

PatternModel::add
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:1680

PatternSetModel::has
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:364

t_relationmap_iterator
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t >::iterator t_relationmap_iterator
Definition: patternmodel.h:232

PatternSetModel::load
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:394

PatternModelInterface::frequency
virtual double frequency(const Pattern &)=0

IndexedCorpus::empty
bool empty() const
Definition: patternstore.h:270

PatternSetModel::type
unsigned char type() const
Definition: patternmodel.h:511

UNINDEXEDPATTERNMODEL
Definition: patternmodel.h:73

Pattern::tostring
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278

PatternModel::load
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:682

IndexedPatternModel::getsubchildren
t_relationmap getsubchildren(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2642

PatternModel::getsubchildren
virtual t_relationmap getsubchildren(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2169

PatternModel::add
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1711

Pattern::isskipgram
const bool isskipgram() const
Definition: pattern.h:170

PatternModel::test
void test(MapType &target, std::istream *in)

PatternModel::printmodel
void printmodel(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1902

IndexedPatternModel::outputcooc_npmi
void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3243

IndexedPatternModel::computecooc
void computecooc(std::map< PatternPointer, t_relationmap > &coocmap, int threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3169

PatternModelOptions::MINTOKENS
int MINTOKENS
Definition: patternmodel.h:113

IndexedPatternModel::getmodelversion
int getmodelversion() const
Definition: patternmodel.h:2272

PatternPointer
Definition: pattern.h:357

PatternPointer::instanceof
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1533

IndexedPatternModel::getleftcooc
t_relationmap getleftcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2983

IndexedPatternModel::getskipcontent
t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2499

IndexedData::iterator
std::vector< IndexReference >::iterator iterator
Definition: datatypes.h:109

NGRAM
Definition: pattern.h:54

PatternModelOptions::DOPATTERNPERLINE
bool DOPATTERNPERLINE
Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a ...
Definition: patternmodel.h:140

patternstore.h
Contains lower-level containers for patterns.

comparemodels_loglikelihood
double comparemodels_loglikelihood(const Pattern pattern, std::vector< PatternModel< uint32_t > * > &models)
Definition: patternmodel.cpp:23

PatternModel::computecoveragestats
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:1569

PatternModel::getdata
virtual ValueType * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:1343

PatternModelOptions::DOREMOVESKIPGRAMS
bool DOREMOVESKIPGRAMS
Remove skip-grams from the model upon loading it.
Definition: patternmodel.h:146

Pattern
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75

IndexedPatternModel::postread
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:2194

IndexedCorpus::sentences
unsigned int sentences() const
Definition: patternstore.h:150

PatternModel::train
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:1081

IndexedCorpus::iterator
Definition: patternstore.h:156

PatternSetModel
A pattern model based on an unordered set, does not hold data, only patterns. Very suitable for loadi...
Definition: patternmodel.h:299

IndexedCorpus::getpattern
PatternPointer getpattern(const IndexReference &begin, int length=1) const
Definition: pattern.cpp:1764

IndexedPatternPointerModel::getmodeltype
int getmodeltype() const
Definition: patternmodel.h:3497

PatternMap::end
iterator end()
Definition: patternstore.h:813

PatternSetModel::getmodelversion
virtual int getmodelversion() const
Definition: patternmodel.h:359

ModelType
ModelType
Definition: patternmodel.h:72

Pattern::bytesize
const size_t bytesize() const
Definition: pattern.cpp:57

IndexedPatternModel::getcooc
t_relationmap getcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, bool ordersignificant=false)
Definition: patternmodel.h:3026

IndexedPatternModel::getsubparents
t_relationmap getsubparents(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2705

PatternModel::trainskipgrams
virtual void trainskipgrams(const PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:1266

IndexedPatternModel::npmi
double npmi(const PatternPointer &key1, const PatternPointer &key2, int jointcount)
Definition: patternmodel.h:3067

PatternModel::getmodeltype
virtual int getmodeltype() const
Definition: patternmodel.h:653

IndexedPatternModel::getinstances
t_relationmap getinstances(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2601

PatternModel
A model mapping patterns to values, high-level interface.
Definition: patternmodel.h:526

PatternPointer::n
const size_t n() const
Definition: pattern.cpp:93

IndexedPatternModel::computeflexgrams_fromskipgrams
int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:3192

PatternPointerModel::getmodeltype
int getmodeltype() const
Definition: patternmodel.h:3398

PatternMap::iterator
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807

PatternModelOptions::DORESET
bool DORESET
Sets all counts to zero upon loading, clears indices
Definition: patternmodel.h:148

PatternModel::totaltokens
uint64_t totaltokens
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:530

PatternModelInterface::maxlength
virtual int maxlength() const  =0

PatternModelOptions::DOSKIPGRAMS_EXHAUSTIVE
bool DOSKIPGRAMS_EXHAUSTIVE
Load/extract skipgrams in an exhaustive fashion? More memory intensive, but the only option for unin...
Definition: patternmodel.h:135

PatternModel::output
void output(std::ostream *)

mask2vector
vector< pair< int, int > > mask2vector(const uint32_t mask, const int n)
Definition: algorithms.cpp:35

PatternModelInterface
Basic read-only interface for pattern models, abstract base class.
Definition: interface.h:39

PatternModel::getleftneighbours
virtual t_relationmap getleftneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2174

IndexedPatternModel::getrightneighbours
t_relationmap getrightneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2806

PatternModel::computestats
void computestats()
Definition: patternmodel.h:1526

IndexedPatternModel::getleftneighbours
t_relationmap getleftneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2764

PatternStoreInterface
Limited virtual interface to pattern stores.
Definition: interface.h:20

PatternModel::info
void info(std::ostream *OUT)
Definition: patternmodel.h:2002

PatternPointerModel::getmodelversion
int getmodelversion() const
Definition: patternmodel.h:3399

PatternModelInterface::getmodelversion
virtual int getmodelversion() const  =0

PatternSetModel::PatternSetModel
PatternSetModel(const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:341

PatternModel::model_type
unsigned char model_type
Definition: patternmodel.h:528

PatternModel::minlength
virtual int minlength() const
Definition: patternmodel.h:1316

PatternModel::getmodelversion
virtual int getmodelversion() const
Definition: patternmodel.h:657

IndexedPatternModel::prunerelations
void prunerelations(t_relationmap &relations, unsigned int occurrencethreshold)
Definition: patternmodel.h:2539

PatternModel::occurrencecount
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:1321

PatternModel::histogram
void histogram(std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1948

PatternSetModel::totaltokens
uint64_t totaltokens
Definition: patternmodel.h:303

PatternSetModel::getmodeltype
virtual int getmodeltype() const
Definition: patternmodel.h:358

PatternModel::posttrain
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:558

KeyError
Definition: common.h:43

PatternModel::computeskipgrams
virtual int computeskipgrams(const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false)
Definition: patternmodel.h:1245

end
void end(Measurement &m)
Definition: benchmarks.cpp:156

PatternModelOptions::MINTOKENS_UNIGRAMS
int MINTOKENS_UNIGRAMS
Definition: patternmodel.h:121

IndexedCorpus
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44

t_relationmap
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t > t_relationmap
Definition: patternmodel.h:224

PatternModelOptions::MINTOKENS_SKIPGRAMS
int MINTOKENS_SKIPGRAMS
Definition: patternmodel.h:116

PatternSetModel::totaltypes
uint64_t totaltypes
Definition: patternmodel.h:304

IndexedPatternModel::computecoveragestats
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:2877

PatternModel::getreverseindex_right
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_right(const IndexReference ref)
Definition: patternmodel.h:1489

IndexedPatternModel::outputrelations
void outputrelations(const PatternPointer &pattern, t_relationmap &relations, ClassDecoder &classdecoder, std::ostream *OUT, const std::string label="RELATED-TO")
Definition: patternmodel.h:3080

PatternSetModel::maxlength
virtual int maxlength() const
Definition: patternmodel.h:487

COMPACT
Definition: patternmodel.h:88

PatternModelOptions::DEBUG
bool DEBUG
Output extra debug information.
Definition: patternmodel.h:151

PatternSetModel::const_iterator
PatternSet< uint64_t >::const_iterator const_iterator
Definition: patternmodel.h:482

Pattern::category
const PatternCategory category() const
Definition: pattern.cpp:42

ReverseIndexType
ReverseIndexType
Definition: patternmodel.h:85

PatternSetModel::version
unsigned char version() const
Definition: patternmodel.h:515

PatternModel::resetstats
virtual void resetstats()
Definition: patternmodel.h:1559

PatternModelInterface::tokens
virtual unsigned int tokens() const  =0

PatternModelOptions::DOREMOVEFLEXGRAMS
bool DOREMOVEFLEXGRAMS
Remove flexgrams from the model upon loading it.
Definition: patternmodel.h:147

PatternModelOptions::DOSKIPGRAMS
bool DOSKIPGRAMS
Load/extract skipgrams? (default: false)
Definition: patternmodel.h:134

IndexedPatternPointerModel
Definition: patternmodel.h:3441

ClassDecoder
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43

PatternModel< ValueType, ValueHandler, MapType, PatternPointer >::const_iterator
MapType::const_iterator const_iterator
Definition: patternmodel.h:1307

IndexReference
Reference to a position in the corpus.
Definition: datatypes.h:33

INDEXEDPATTERNMODEL
Definition: patternmodel.h:75

PATTERNSETMODEL
Definition: patternmodel.h:77

PatternMap
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782

PatternModel::getdata
virtual ValueType * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:1354

PatternMap::insert
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789

PatternSet::read
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644

PatternStore< t_patternset, uint64_t, Pattern >::classencodingversion
unsigned char classencodingversion
Definition: patternstore.h:328

PatternModel::extractset
PatternSet< uint64_t > extractset(int minlength=1, int maxlength=1)
Definition: patternmodel.h:2147

PatternModel::model_version
unsigned char model_version
Definition: patternmodel.h:529

PatternModel::cache_grouptotal
std::map< int, std::map< int, unsigned int > > cache_grouptotal
total occurrences (used for frequency computation, within a group)
Definition: patternmodel.h:539

IndexedPatternPointerModel::getmodelversion
int getmodelversion() const
Definition: patternmodel.h:3498

PatternPointer::subngrams
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const
Definition: pattern.cpp:1142

FLEXGRAM
Definition: pattern.h:56

PatternPointer::bytesize
const size_t bytesize() const
Definition: pattern.h:435

IndexedPatternModel::gettemplates
t_relationmap gettemplates(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2559

PatternSetModel::iterator
PatternSet< uint64_t >::iterator iterator
Definition: patternmodel.h:481

IndexedData::end
iterator end()
Definition: datatypes.h:115

PatternPointer::isgap
bool isgap(int i) const
Definition: pattern.cpp:126

PatternModel::has
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:669

PatternSet::write
void write(std::ostream *out)
Definition: patternstore.h:632

PatternModelInterface::getmodeltype
virtual int getmodeltype() const  =0

PatternModel::frequency
double frequency(const Pattern &pattern)
Definition: patternmodel.h:1666

PatternPointerModel
Definition: patternmodel.h:3342

PATTERNALIGNMENTMODEL
Definition: patternmodel.h:78

PatternModel::totalwordtypesingroup
unsigned int totalwordtypesingroup(int category, int n)
Definition: patternmodel.h:1645

PatternModel::computeflexgrams_fromcooc
virtual int computeflexgrams_fromcooc()
Definition: patternmodel.h:2178

algorithms.h

Pattern::size
const size_t size() const
Definition: pattern.h:156

PatternModel< ValueType, ValueHandler, MapType, PatternPointer >::iterator
MapType::iterator iterator
Definition: patternmodel.h:1306

PatternModelOptions::DOREMOVEINDEX
bool DOREMOVEINDEX
Do not load index information (for indexed models), loads just the patterns without any counts...
Definition: patternmodel.h:144

PatternModel::maxn
int maxn
Definition: patternmodel.h:533

PatternSetModel::getinterface
PatternModelInterface * getinterface()
Definition: patternmodel.h:465

PatternModel::has
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:672

PatternModelInterface::types
virtual unsigned int types()=0

PatternModel::add
virtual void add(const Pattern &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1704

PatternModelOptions::QUIET
bool QUIET
Don't output to stderr.
Definition: patternmodel.h:150

IndexedCorpus::end
iterator end()
Definition: patternstore.h:224

PatternModel::prune
unsigned int prune(int threshold, int _n=0)
Definition: patternmodel.h:1728

PatternModelOptions::MAXBACKOFFLENGTH
int MAXBACKOFFLENGTH
Definition: patternmodel.h:127

PatternModel::findskipgrams
virtual std::vector< PatternPointer > findskipgrams(const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3)
Definition: patternmodel.h:1254

PatternModelInterface::occurrencecount
virtual unsigned int occurrencecount(const Pattern &pattern)=0

PatternModelOptions::MAXSKIPS
int MAXSKIPS
Maximum skips per skipgram.
Definition: patternmodel.h:137

PatternSetModel::has
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:367

PatternSetModel::model_type
unsigned char model_type
Definition: patternmodel.h:301

PatternModel::write
void write(const std::string filename)
Definition: patternmodel.h:1299

PatternModelOptions
Options for Pattern Model loading and training.
Definition: patternmodel.h:111

IndexPattern
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39

IndexedCorpus::sentencelength
int sentencelength(int sentence) const
Definition: pattern.cpp:1806

PatternModelOptions::PRUNENONSUBSUMED
int PRUNENONSUBSUMED
Definition: patternmodel.h:142

IndexReference::token
uint16_t token
Definition: datatypes.h:36

Pattern::ngrams
int ngrams(std::vector< Pattern > &container, const int n) const
Definition: pattern.cpp:1050

PatternModel::cache_n
std::set< int > cache_n
Definition: patternmodel.h:538

PatternMap::size
size_t size() const
Definition: patternstore.h:800

PatternMap::begin
iterator begin()
Definition: patternstore.h:810

getmodeltype
int getmodeltype(const std::string &filename)
Definition: patternmodel.cpp:4

IndexedPatternModel::getmodeltype
int getmodeltype() const
Definition: patternmodel.h:2271

PatternModel::outputcooc
virtual void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2180

Pattern::subngrams
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const
Definition: pattern.cpp:1120

INDEXEDPATTERNPOINTERMODEL
Definition: patternmodel.h:76

IndexedData::insert
void insert(IndexReference ref)
Definition: datatypes.h:106

IndexedData
Collection of references to positions in the corpus (IndexReference). Used by Indexed Pattern models...
Definition: datatypes.h:86

PatternModel::print
virtual void print(std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true)
Definition: patternmodel.h:1911

PatternModel::gapmasks
std::map< int, std::vector< uint32_t > > gapmasks
pre-computed masks representing possible gap configurations for various pattern lengths ...
Definition: patternmodel.h:545

PatternModel::reverseindex_internal
bool reverseindex_internal
Definition: patternmodel.h:564

PatternModel::getsubparents
virtual t_relationmap getsubparents(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2170

PatternSetModel::frequency
virtual double frequency(const Pattern &)
Definition: patternmodel.h:479

PatternSetModel::PatternSetModel
PatternSetModel()
Definition: patternmodel.h:311

PatternModel::coverage
double coverage(const Pattern &key)
Definition: patternmodel.h:1397

t_relationmap_double
PatternMap< double, BaseValueHandler< double >, uint64_t > t_relationmap_double
Definition: patternmodel.h:230

IndexedCorpus::bytesize
unsigned int bytesize() const
Definition: patternstore.h:118

PatternPointer::mask
uint32_t mask
Definition: pattern.h:362

PatternModel::getreverseindex_left
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_left(const IndexReference ref)
Definition: patternmodel.h:1507

PatternModel::computeskipgrams
virtual int computeskipgrams(const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false)
Definition: patternmodel.h:1101

PatternPointer::category
const PatternCategory category() const
Definition: pattern.cpp:46

PatternModel::outputcooc_npmi
virtual void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2179

PatternModel::train
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:778

IndexedPatternPointerModel::add
void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:3516

IndexedCorpus::getsentence
PatternPointer getsentence(int sentence) const
Definition: pattern.cpp:1826

PatternModelOptions::DOREMOVENGRAMS
bool DOREMOVENGRAMS
Remove n-grams from the model upon loading it.
Definition: patternmodel.h:145

PatternModel::cache_categories
std::set< int > cache_categories
Definition: patternmodel.h:537

PatternModel::getreverseindex_bysentence
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_bysentence(int sentence)
Definition: patternmodel.h:1471

IndexedPatternPointerModel::add
void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3507

IndexedPatternModel::outputrelations
void outputrelations(const PatternPointer &pattern, ClassDecoder &classdecoder, std::ostream *OUT, bool outputheader=true)
Definition: patternmodel.h:3101

PatternSetModel::write
void write(const std::string &filename)
Definition: patternmodel.h:455

IndexedPatternModel::train
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2329

PatternModel::getinterface
PatternModelInterface * getinterface()
Definition: patternmodel.h:765

UNINDEXEDPATTERNPOINTERMODEL
Definition: patternmodel.h:74

IndexedCorpus::begin
iterator begin()
Definition: patternstore.h:214

PatternSetModel::tokens
virtual unsigned int tokens() const
Definition: patternmodel.h:505

PatternPointer::data
unsigned char * data
Definition: pattern.h:360

PatternPointer::parts
int parts(std::vector< PatternPointer > &container) const
Definition: pattern.cpp:1337

IndexedCorpus::size
size_t size()
Definition: patternstore.h:261

classencoder.h
Class for encoding plain-text to binary class-encoded data.

IndexedPatternModel::computenpmi
void computenpmi(std::map< PatternPointer, t_relationmap_double > &coocmap, double threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3141

Pattern::n
const size_t n() const
Definition: pattern.cpp:89

IndexedPatternModel::info
void info(std::ostream *OUT)
Definition: patternmodel.h:2340

IndexedPatternModel::pruneskipgrams
int pruneskipgrams(int threshold, int minskiptypes, int _n=0)
Definition: patternmodel.h:2845

PatternModel::coveragecount
unsigned int coveragecount(const Pattern &key)
Definition: patternmodel.h:1389

IndexedPatternModel
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of ...
Definition: patternmodel.h:2192

PatternModel::topthreshold
unsigned int topthreshold(int amount, int category=0, int size=0)
Definition: patternmodel.h:1967

reversemask
uint32_t reversemask(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:58

PatternModel::histogram
void histogram(std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1988

PatternModel::occurrencecount
virtual unsigned int occurrencecount(const PatternPointer &pattern)
Definition: patternmodel.h:1330

IndexedData::const_iterator
std::vector< IndexReference >::const_iterator const_iterator
Definition: datatypes.h:110

PatternSet
A pattern store in the form of an unordered set (i.e., no duplicates). Stores only patterns...
Definition: patternstore.h:538

IndexedPatternModel::print
void print(std::ostream *out, ClassDecoder &decoder, const PatternPointer &pattern, bool endline=true)
Definition: patternmodel.h:2421

PatternModel::totaltypes
uint64_t totaltypes
Total number of unigram/word types in the original corpus, SO INCLUDING THOSE NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:531

InternalError
Definition: common.h:35

PatternModel::prunenotinset
unsigned int prunenotinset(const std::unordered_set< Pattern > &s, int _n)
Definition: patternmodel.h:1784

PatternSetModel::types
virtual unsigned int types()
Definition: patternmodel.h:498

PatternModel::cache_grouptotalwordtypes
std::map< int, std::map< int, unsigned int > > cache_grouptotalwordtypes
total covered word types per group
Definition: patternmodel.h:541

IndexedPatternModel::train
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2321

masktailskip
int masktailskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:77

PatternType
PatternType
Definition: pattern.h:59

PatternModel::computeflexgrams_fromskipgrams
virtual int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:2177

PatternModelOptions::PatternModelOptions
PatternModelOptions(const PatternModelOptions &ref)
Definition: patternmodel.h:188

NoSuchPattern
Definition: patternmodel.h:97

PatternModel::prunebymodel
unsigned int prunebymodel(PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel)
Definition: patternmodel.h:1811

PatternModelOptions::MINLENGTH
int MINLENGTH
The minimum length of patterns to be loaded/extracted (in words/tokens) (default: 1) ...
Definition: patternmodel.h:125

PatternModel::write
void write(std::ostream *out)
Definition: patternmodel.h:1279

PatternModel::types
virtual unsigned int types()
Definition: patternmodel.h:1368

PatternPointerModel::add
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3408

PatternSetModel::maxn
int maxn
Definition: patternmodel.h:305

PatternPointer::pattern
Pattern pattern() const
Definition: pattern.h:527

PatternModel::size
virtual size_t size() const
Definition: patternmodel.h:662

PatternSetModel::minn
int minn
Definition: patternmodel.h:306

PatternSetModel::occurrencecount
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:473

PatternModel::load
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:700

getdataversion
unsigned char getdataversion(std::istream *in)
Definition: classdecoder.cpp:257

PatternSet::insert
void insert(const Pattern &pattern)
Definition: patternstore.h:580

begin
Measurement begin(const string &title)
Definition: benchmarks.cpp:148

IndexedData::begin
iterator begin()
Definition: datatypes.h:112

Pattern::parts
int parts(std::vector< Pattern > &container) const
Definition: pattern.cpp:1225

PatternModelOptions::PatternModelOptions
PatternModelOptions()
Definition: patternmodel.h:157

PatternModel::print
virtual void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1854

PatternModel::tokens
virtual unsigned int tokens() const
Definition: patternmodel.h:1376

PatternSetModel::model_version
unsigned char model_version
Definition: patternmodel.h:302

IndexReference::sentence
uint32_t sentence
Definition: datatypes.h:35

PatternModel::cache_grouptotalpatterns
std::map< int, std::map< int, unsigned int > > cache_grouptotalpatterns
total distinct patterns per group
Definition: patternmodel.h:540

PatternSet::has
bool has(const Pattern &pattern) const
Definition: patternstore.h:587

PatternModelInterface::getstoreinterface
virtual PatternStoreInterface * getstoreinterface()
Definition: patternmodel.h:288

PatternModel::getpatterns
std::vector< std::pair< Pattern, int > > getpatterns(const Pattern &pattern)
Definition: patternmodel.h:1834

t_relationmap_double_iterator
PatternMap< double, BaseValueHandler< double >, uint64_t >::iterator t_relationmap_double_iterator
Definition: patternmodel.h:233

PatternModel::getrightneighbours
virtual t_relationmap getrightneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2175

IndexedPatternModel::trainskipgrams
virtual void trainskipgrams(PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:2453

PatternModel::hasskipgrams
bool hasskipgrams
Does this model have skipgrams?
Definition: patternmodel.h:565

QUICK
Definition: patternmodel.h:87

IndexedPatternModel::flexgramsize
int flexgramsize(const Pattern &pattern, IndexReference begin)
Definition: patternmodel.h:3300

PatternModel::cache_grouptotaltokens
std::map< int, std::map< int, unsigned int > > cache_grouptotaltokens
total covered tokens per group
Definition: patternmodel.h:542

PatternPointer::tostring
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:283

IndexedPatternModel::add
virtual void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2288

PatternModel::postread
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:547

PatternPointerModel::add
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:3428

PatternSetModel::size
virtual size_t size() const
Definition: patternmodel.h:361

compute_skip_configurations
vector< uint32_t > compute_skip_configurations(const int n, const int maxskips)
Definition: algorithms.cpp:85

PatternModel::totaloccurrencesingroup
unsigned int totaloccurrencesingroup(int category, int n)
Definition: patternmodel.h:1623

IndexedPatternModel::computeflexgrams_fromcooc
int computeflexgrams_fromcooc(double threshold)
Definition: patternmodel.h:3217

IndexedPatternModel::add
virtual void add(const Pattern &pattern, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2282

SKIPGRAM
Definition: pattern.h:55

IndexReference::tostring
std::string tostring() const
Definition: datatypes.h:72

PatternSetModel::PatternSetModel
PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:325

PatternModel::outputrelations
virtual void outputrelations(const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT)
Definition: patternmodel.h:2168

IndexedPatternModel::getrightcooc
t_relationmap getrightcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, IndexedData *matches=NULL)
Definition: patternmodel.h:2939

PatternModel::pruneskipgrams
virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n=0)
Definition: patternmodel.h:1758