66 #include "bz2stream.h" 
   98   virtual const char* what() 
const throw()
 
  100     return "Pattern not found in model";
 
  159             MINTOKENS_SKIPGRAMS = -1; 
 
  160             MINTOKENS_UNIGRAMS = 1; 
 
  163             MAXBACKOFFLENGTH = 100;
 
  168             DOSKIPGRAMS_EXHAUSTIVE = 
false;
 
  170             DOREVERSEINDEX = 
true; 
 
  171             DOPATTERNPERLINE = 
false;
 
  174             DOREMOVEINDEX = 
false; 
 
  175             DOREMOVENGRAMS = 
false;
 
  176             DOREMOVESKIPGRAMS = 
false;
 
  177             DOREMOVEFLEXGRAMS = 
false;
 
  179             PRUNENONSUBSUMED = 
false;
 
  280         virtual unsigned int types() =0;
 
  286         virtual unsigned int tokens() 
const=0;
 
  332             this->
load(f,options, constrainmodel);
 
  348             if (!options.
QUIET) std::cerr << 
"Loading " << filename << std::endl;
 
  349             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
  351                 std::cerr << 
"ERROR: Unable to load file " << filename << std::endl;
 
  354             this->
load( (std::istream *) in, options, constrainmodel);
 
  378             if (!options.
QUIET) std::cerr << 
"Loading " << filename << 
" as set-model" << std::endl;
 
  379             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
  381                 std::cerr << 
"ERROR: Unable to load file " << filename << std::endl;
 
  384             this->
load( (std::istream *) in, options, constrainmodel);
 
  396             f->read( (
char*) &null, 
sizeof(
char));        
 
  397             f->read( (
char*) &model_type, 
sizeof(
char));        
 
  398             f->read( (
char*) &model_version, 
sizeof(
char));       
 
  401                 std::cerr << 
"ERROR: File is not a colibri patternmodel file" << std::endl;
 
  404             if (model_version > 2) {
 
  405                 std::cerr << 
"WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
 
  407             f->read( (
char*) &totaltokens, 
sizeof(uint64_t));        
 
  408             f->read( (
char*) &totaltypes, 
sizeof(uint64_t)); 
 
  411             if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();
 
  414                 std::cerr << 
"Debug enabled, loading PatternModel type " << (int) model_type << 
", version " << (
int) model_version << 
", classencodingversion" << (int) this->
classencodingversion << std::endl;   
 
  415                 std::cerr << 
"Total tokens: " << totaltokens << 
", total types: " << totaltypes << std::endl;;   
 
  430                 std::cerr << 
"ERROR: Unknown model type" << std::endl;
 
  440             out->write( (
char*) &null, 
sizeof(
char));       
 
  442             out->write( (
char*) &t, 
sizeof(
char));        
 
  444             out->write( (
char*) &v, 
sizeof(
char));        
 
  445             out->write( (
char*) &totaltokens, 
sizeof(uint64_t));        
 
  446             const uint64_t tp = this->
types(); 
 
  447             out->write( (
char*) &tp, 
sizeof(uint64_t)); 
 
  455         void write(
const std::string & filename) {
 
  456             std::ofstream * out = 
new std::ofstream(filename.c_str());
 
  525 template<
class ValueType, 
class ValueHandler = BaseValueHandler<ValueType>, 
class MapType = PatternMap<ValueType, BaseValueHandler<ValueType>>, 
class PatternType = Pattern>
 
  553                 if (n > maxn) maxn = n;
 
  554                 if (n < minn) minn = n;
 
  575             hasskipgrams = 
false;
 
  579                 this->reverseindex = corpus;
 
  580                 this->attachcorpus(*corpus);
 
  582                 this->reverseindex = NULL;
 
  584             reverseindex_internal = 
false;
 
  599             hasskipgrams = 
false;
 
  602             this->
load(f,options,constrainmodel);
 
  604                 this->reverseindex = corpus;
 
  605                 this->attachcorpus(*corpus);
 
  607                 this->reverseindex = NULL;
 
  609             reverseindex_internal = 
false;
 
  613             if (reverseindex_internal && reverseindex != NULL) 
delete reverseindex;
 
  628             hasskipgrams = 
false;
 
  632                 this->reverseindex = corpus;
 
  633                 this->attachcorpus(*corpus);
 
  635                 this->reverseindex = NULL;
 
  637             reverseindex_internal = 
false;
 
  638             if (!options.QUIET) std::cerr << 
"Loading " << filename << std::endl;
 
  639             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
  641                 std::cerr << 
"ERROR: Unable to load file " << filename << std::endl;
 
  644             this->
load( (std::istream *) in, options, constrainmodel);
 
  663             return MapType::size();
 
  670             return MapType::has(pattern);
 
  673             return MapType::has(pattern);
 
  683             if (!options.
QUIET) std::cerr << 
"Loading " << filename << std::endl;
 
  684             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
  686                 std::cerr << 
"ERROR: Unable to load file " << filename << std::endl;
 
  689             this->
load( (std::istream *) in, options, constrainmodel);
 
  702             f->read( (
char*) &null, 
sizeof(
char));        
 
  703             f->read( (
char*) &model_type, 
sizeof(
char));        
 
  704             f->read( (
char*) &model_version, 
sizeof(
char));        
 
  705             if (model_version == 1) this->classencodingversion = 1;
 
  707                 std::cerr << 
"File is not a colibri model file (or a very old one)" << std::endl;
 
  710             if (model_version > 2) {
 
  711                 std::cerr << 
"WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
 
  714                 std::cerr << 
"Debug enabled, loading PatternModel type " << (int) model_type << 
", version " << (
int) model_version << 
", classencodingversion=" << (int) this->classencodingversion << std::endl;   
 
  718                 if (options.
DEBUG) std::cerr << 
"Reading corpus data" << std::endl;
 
  719                 unsigned int corpussize;
 
  720                 f->read( (
char*) &corpussize, 
sizeof(
unsigned int));
 
  721                 unsigned char * corpusdata = 
new unsigned char[corpussize];
 
  722                 f->read((
char*) corpusdata,
sizeof(
unsigned char) * corpussize);
 
  724                 this->attachcorpus(*reverseindex);
 
  725                 reverseindex_internal = 
true;
 
  726                 if (options.
DEBUG) std::cerr << 
"(read " << corpussize << 
" bytes)" << std::endl;
 
  728             f->read( (
char*) &totaltokens, 
sizeof(uint64_t));        
 
  729             f->read( (
char*) &totaltypes, 
sizeof(uint64_t)); 
 
  735                 std::cerr << 
"Total tokens: " << totaltokens << 
", total types: " << totaltypes << std::endl;;   
 
  741                  MapType::template read<IndexedData,IndexedDataHandler,PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore,  !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET,   options.
DEBUG);
 
  744                  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,
PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET,   options.
DEBUG);
 
  747                  MapType::template read<uint32_t,BaseValueHandler<uint32_t>,
PatternPointer>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET,   options.
DEBUG);
 
  750                  MapType::template read<IndexedData,IndexedDataHandler,PatternPointer>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore,  !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET,   options.
DEBUG);
 
  755                 MapType::template read<PatternFeatureVectorMap<double>,
PatternFeatureVectorMapHandler<double>,
PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore,  !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS,options.
DORESET,  options.
DEBUG);
 
  782             if (constrainbymodel == 
this) {
 
  785             } 
else if (constrainbymodel != NULL) {
 
  786                 totaltypes = constrainbymodel->types();
 
  787                 totaltokens = constrainbymodel->tokens();
 
  789             uint32_t sentence = firstsentence-1;
 
  792             bool iter_unigramsonly = 
false; 
 
  793             bool skipunigrams = 
false; 
 
  795                 iter_unigramsonly = 
true;
 
  798             if (!options.
QUIET) {
 
  799                 std::cerr << 
"Training patternmodel";
 
  800                 if (constrainbymodel != NULL) std::cerr << 
", constrained by another model";
 
  801                 std::cerr << 
", occurrence threshold: " << options.
MINTOKENS;
 
  802                 if (iter_unigramsonly) std::cerr << 
", secondary word occurrence threshold: " << options.
MINTOKENS_UNIGRAMS;
 
  803                 if (version < 2) std::cerr << 
", class encoding version: " << (int) version;
 
  804                 std::cerr << std::endl; 
 
  806             std::vector<std::pair<PatternPointer,int> > ngrams;
 
  807             std::vector<PatternPointer> subngrams;
 
  810             int prevsize = this->
size();
 
  811             if (constrainbymodel == 
this) prevsize = 0; 
 
  815             if (!this->data.empty()) {
 
  816                 if ((continued) && (!options.
QUIET)) std::cerr << 
"Continuing training on preloaded model, computing statistics..." << std::endl;
 
  820             for (
int n = 1; n <= options.
MAXLENGTH; n++) { 
 
  821                 bool skipgramsonly = 
false; 
 
  823                     if ((options.
MINTOKENS > 1) && (constrainbymodel == NULL)) {
 
  824                        if (cache_grouptotal[
NGRAM][n] > 0) {
 
  828                                 if (!options.
QUIET) std::cerr << 
"Skipping " << n << 
"-grams, already in model" << std::endl; 
 
  835                 int foundskipgrams = 0;
 
  844                 if (!options.
QUIET) {
 
  845                     if (iter_unigramsonly) {
 
  846                         std::cerr << 
"Counting unigrams using secondary word occurrence threshold (" << options.
MINTOKENS_UNIGRAMS << 
")" << std::endl;
 
  848                         std::cerr << 
"Counting patterns from list, one per line" << std::endl; 
 
  849                     } 
else if (constrainbymodel != NULL) {
 
  850                         std::cerr << 
"Counting n-grams that occur in constraint model" << std::endl;
 
  852                         std::cerr << 
"Counting " << n << 
"-grams" << std::endl; 
 
  853                         if (skipgramsonly) std::cerr << 
"(only counting skipgrams actually, n-grams already counted earlier)" << std::endl; 
 
  855                         std::cerr << 
"Counting *all* n-grams (occurrence threshold=1)" << std::endl;
 
  863                 sentence = firstsentence-1; 
 
  864                 bool singlepass = 
false;
 
  865                 const unsigned int sentences = (reverseindex != NULL) ? reverseindex->
sentences() : 0;
 
  866                 while (((reverseindex != NULL) && (sentence < sentences)) ||  ((reverseindex == NULL) && (in != NULL) && (!in->eof())))  {
 
  869                     if (linepattern != NULL) 
delete linepattern;
 
  870                     if (reverseindex == NULL) linepattern = 
new Pattern(in,
false,version);
 
  873                     const unsigned int linesize = line.
n();
 
  874                     if (options.
DEBUG) std::cerr << 
"Processing line " << sentence << 
", size (tokens) " << linesize << 
" (bytes) " << line.
bytesize() << 
", n=" << n <<  std::endl;
 
  880                     if ((n==1) && (!continued)) totaltokens += linesize;
 
  885                         if (linesize > (
unsigned int) options.
MAXLENGTH) 
continue;
 
  886                         ngrams.push_back(std::pair<PatternPointer,int>(line,0));
 
  888                         if (iter_unigramsonly) {
 
  890                         } 
else if ((options.
MINTOKENS > 1) && (constrainbymodel == NULL)) {
 
  895                             if (continued) minlength = this->maxn + 1;
 
  899                     if (options.
DEBUG) std::cerr << 
"\t" << ngrams.size() << 
" ngrams in line" << std::endl;
 
  903                     for (std::vector<std::pair<PatternPointer,int>>::
iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
 
  906                             if ((singlepass) && (options.
MINLENGTH == 1) && (skipunigrams) && (iter->first.n() == 1)) {
 
  911                             if (!skipgramsonly) { 
 
  913                                 if ((constrainbymodel != NULL) && (!iter_unigramsonly) && (!constrainbymodel->has(iter->first))) 
continue; 
 
  921                                     iter->first.ngrams(subngrams,1); 
 
  922                                     for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
 
  938                                         iter->first.ngrams(subngrams, backoffn);
 
  939                                     for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
 
  940                                         if (!this->
has(*iter2)) { 
 
  949                                 if ((found) && (!skipgramsonly)) {
 
  950                                     if (options.
DEBUG) std::cerr << 
"\t\tAdding @" << ref.
sentence << 
":" << ref.
token << 
" n=" << iter->first.n() << 
" category=" <<(int) iter->first.category()<< std::endl;
 
  951                                     add(iter->first, ref);
 
  954                             if (((n >= 3) || (options.
MINTOKENS == 1)) 
 
  956                                 int foundskipgrams_thisround = this->
computeskipgrams(iter->first, options, &ref, NULL, constrainbymodel, 
true );
 
  957                                 if (foundskipgrams_thisround > 0) hasskipgrams = 
true;
 
  958                                 foundskipgrams += foundskipgrams_thisround; 
 
  960                         } 
catch (std::exception &e) {
 
  961                             std::cerr << 
"ERROR: An internal error has occured during training!!!" << std::endl;
 
  962                             if (ignoreerrors) 
continue;
 
  969                 if (!iter_unigramsonly) {
 
  970                     foundngrams = this->
size() - foundskipgrams - prevsize;
 
  972                     if ((foundngrams) || (foundskipgrams)) {
 
  973                         if (n > this->maxn) this->maxn = n;
 
  974                         if (n < this->minn) this->minn = n;
 
  976                         if (!options.
QUIET) std::cerr << 
"None found" << std::endl;
 
  977                         if (!continued) 
break;
 
  979                     if (!options.
QUIET) std::cerr << 
" Found " << foundngrams << 
" ngrams...";
 
  981                     if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == 
this))) {
 
  982                         if ((options.
MINTOKENS > 1) && (n == 1)) {
 
  983                             totaltypes = this->
size(); 
 
  985                             if (!options.
QUIET) std::cerr << 
" computing total word types prior to pruning...";
 
  987                             if (!options.
QUIET) std::cerr << totaltypes << 
"...";                           
 
 1000                             this->
prune(-1, n-1);
 
 1001                             if (!options.
QUIET) std::cerr << 
" (pruned last iteration due to minimum length)" << pruned;
 
 1004                     if (!options.
QUIET) std::cerr << 
"pruned " << pruned;
 
 1005                     if (foundskipgrams) {
 
 1006                         unsigned int prunedextra;
 
 1007                         if ((options.
MINTOKENS == 1) || (constrainbymodel != NULL)) {
 
 1012                         if (prunedextra && !options.
QUIET) std::cerr << 
" plus " << prunedextra << 
" extra skipgrams..";
 
 1013                         pruned += prunedextra;
 
 1015                     if (!options.
QUIET) std::cerr << 
"...total kept: " << (foundngrams + foundskipgrams) - pruned << std::endl;
 
 1016                     if (((options.
MINTOKENS == 1) || (constrainbymodel != NULL))) 
break; 
 
 1018                     if (!options.
QUIET) std::cerr <<  
"found " << this->
size() << std::endl;
 
 1020                     if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel == 
this))) {
 
 1021                         if (!options.
QUIET) std::cerr << 
" computing total word types prior to pruning...";
 
 1022                         totaltypes = this->
size();  
 
 1023                         if (!options.
QUIET) std::cerr << totaltypes << 
"...";                           
 
 1028                     iter_unigramsonly = 
false; 
 
 1029                     if ((n == 1) && (options.
MINLENGTH ==1)) skipunigrams = 
true; 
 
 1033                 prevsize = this->
size();
 
 1050                 if (!options.
QUIET) std::cerr << 
"Pruning non-subsumed n-grams"  << std::endl;
 
 1053                 for (
int n = begin_n; n > 1; n--) {
 
 1054                     std::unordered_set<Pattern> subsumed; 
 
 1055                     unsigned int prunednonsubsumed = 0;
 
 1057                     while (iter != this->
end()) {
 
 1058                         const unsigned int pattern_n = iter->first.n();
 
 1059                         if (pattern_n == (
unsigned int) n) {
 
 1061                             iter->first.ngrams(subngrams, n-1);
 
 1062                             for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(
Pattern(*iter2));
 
 1067                     if (!options.
QUIET) std::cerr << 
" pruned " << prunednonsubsumed << 
" non-subsumed " << (n-1) << 
"-grams"  << std::endl;
 
 1071             if (linepattern != NULL) 
delete linepattern;
 
 1082             if ((filename.size() > 3) && (filename.substr(filename.size()-3) == 
".bz2")) {
 
 1083                 std::ifstream * in = 
new std::ifstream(filename.c_str(), std::ios::in|std::ios::binary);
 
 1084                 bz2istream * decompressor = 
new bz2istream(in->rdbuf());
 
 1085                 this->
train( (std::istream*) decompressor, options, constrainbymodel, continued, firstsentence, ignoreerrors);
 
 1086                 delete decompressor;
 
 1089                 std::ifstream * in = 
new std::ifstream(filename.c_str());
 
 1090                 this->
train((std::istream*) in, options, constrainbymodel, continued, firstsentence, ignoreerrors);
 
 1106             if (mintokens == -1) mintokens = 2;;
 
 1107             if (mintokens  <= 1) {
 
 1111             int foundskipgrams = 0;
 
 1112             const int n = pattern.
n();
 
 1113             std::vector<PatternPointer> subngrams;
 
 1119             for (std::vector<uint32_t>::iterator iter2 =  gapmasks[n].
begin(); iter2 != gapmasks[n].end(); iter2++, gapconf_i++) {
 
 1120                 if (*iter2 == 0) 
continue; 
 
 1125                     skipgram.
mask = *iter2;
 
 1128                         std::cerr << 
"Checking for: " << std::endl;
 
 1132                     if ((constrainbymodel != NULL) && (!constrainbymodel->has(skipgram))) 
continue;
 
 1135                         if ((
int) skipgram.
n() != n) {
 
 1136                             std::cerr << 
"Generated invalid skipgram, n=" << skipgram.
n() << 
", expected " << n << std::endl;
 
 1141                     bool skipgram_valid = 
true;
 
 1142                     if ((mintokens != 1) && (constrainbymodel == NULL)) {
 
 1143                         bool check_extra = 
false;
 
 1146                         skipgram.
ngrams(subngrams,n-1); 
 
 1147                         for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) { 
 
 1149                             if (!subpattern.
isgap(0) && !subpattern.
isgap(subpattern.
n() - 1)) {
 
 1154                                     std::cerr << 
"Subpattern: " << std::endl;
 
 1157                                 if (!this->
has(subpattern)) {
 
 1158                                     if (DEBUG) std::cerr << 
"  discarded" << std::endl;
 
 1159                                     skipgram_valid = 
false;
 
 1170                         if (!skipgram_valid) 
continue;
 
 1178                                 std::vector<PatternPointer> parts;
 
 1179                                 skipgram.
parts(parts);
 
 1180                                 for (std::vector<PatternPointer>::iterator iter3 = parts.begin(); iter3 != parts.end(); iter3++) {
 
 1182                                     if (!this->
has(part)) {
 
 1183                                         skipgram_valid = 
false;
 
 1187                                 if (!skipgram_valid) 
continue;
 
 1192                             const std::vector<std::pair<int,int>> gapconfiguration = 
mask2vector(skipgram.
mask, n);
 
 1193                             for (std::vector<std::pair<int,int>>::
const_iterator iter3 = gapconfiguration.begin(); iter3 != gapconfiguration.end(); iter3++) { 
 
 1194                                 if (!((iter3->first - 1 == 0) && (iter3->first + iter3->second + 1 == n))) { 
 
 1197                                         std::cerr << 
"Subskipgram: " << std::endl;
 
 1200                                     if (!this->
has(subskipgram)) {
 
 1201                                         if (DEBUG) std::cerr << 
"  discarded" << std::endl;
 
 1202                                         skipgram_valid = 
false;
 
 1211                     if (skipgram_valid) {
 
 1212                         if (DEBUG) std::cerr << 
"  counted!" << std::endl;
 
 1213                         if (targetcontainer == NULL) {
 
 1215                             if  (!
has(skipgram)) foundskipgrams++;
 
 1216                             if (singleref != NULL) {
 
 1217                                 add(skipgram, *singleref ); 
 
 1218                             } 
else if (multiplerefs != NULL) {
 
 1221                                     add(skipgram, ref ); 
 
 1224                                 std::cerr << 
"ERROR: computeskipgrams() called with no singleref and no multiplerefs" << std::endl;
 
 1230                             targetcontainer->push_back(skipgram);
 
 1236                     std::cerr << 
"IGNORING ERROR and continuing with next skipgram" << std::endl;
 
 1239             return foundskipgrams;
 
 1257             std::vector<PatternPointer> skipgrams;
 
 1267             std::cerr << 
"Can not compute skipgrams on unindexed model (except exhaustively during train() )" << std::endl;
 
 1274         void test(MapType & target, std::istream * in);
 
 1280             const char null = 0;
 
 1281             out->write( (
char*) &null, 
sizeof(
char));       
 
 1283             out->write( (
char*) &t, 
sizeof(
char));        
 
 1285             out->write( (
char*) &v, 
sizeof(
char));        
 
 1287                 out->write( (
char*) &this->corpussize, 
sizeof(
unsigned int));
 
 1288                 out->write((
char*) this->corpusstart, 
sizeof(
unsigned char) * this->corpussize);
 
 1290             out->write( (
char*) &totaltokens, 
sizeof(uint64_t));        
 
 1291             const uint64_t tp = this->
types(); 
 
 1292             out->write( (
char*) &tp, 
sizeof(uint64_t)); 
 
 1293             MapType::write(out); 
 
 1299         void write(
const std::string filename) {
 
 1300             std::ofstream * out = 
new std::ofstream(filename.c_str());
 
 1322             ValueType * data = this->
getdata(pattern);
 
 1324                 return this->valuehandler.count(*data); 
 
 1331             ValueType * data = this->
getdata(pattern);
 
 1333                 return this->valuehandler.count(*data); 
 
 1344             typename MapType::iterator iter = this->find(pattern);
 
 1345             if (iter != this->
end()) {
 
 1346                 return &(iter->second); 
 
 1347             } 
else if (makeifnew) {
 
 1348                 return &((*this)[pattern]);
 
 1355             typename MapType::iterator iter = this->find(pattern);
 
 1356             if (iter != this->
end()) {
 
 1357                 return &(iter->second); 
 
 1358             } 
else if (makeifnew) {
 
 1359                 return &((*this)[pattern]);
 
 1381         void output(std::ostream *);
 
 1410             std::vector<PatternPointer> result;
 
 1411             if (!this->reverseindex) 
return result;
 
 1414             const unsigned int minn = this->
minlength();
 
 1415             const unsigned int maxn = this->
maxlength();
 
 1416             for (
unsigned int n = minn; ref.
token + n <= sl && n <= 
maxn; n++) {
 
 1425                             && ((category == 0) || (ngram.
category() >= category)) ) {
 
 1426                             result.push_back(ngram);
 
 1428                             if (((category == 0) || (category == 
SKIPGRAM)) && (this->hasskipgrams))  {
 
 1434                                 std::vector<PatternPointer> skipgrams = this->
findskipgrams(ngram, occurrencecount);
 
 1435                                 for (
auto skipgram : skipgrams) {
 
 1436                                     result.push_back(skipgram);
 
 1473             std::vector<std::pair<IndexReference,PatternPointer>> result;
 
 1474             for (
int i = 0; i < this->reverseindex->
sentencelength(sentence); i++) {
 
 1477                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
 
 1479                     result.push_back(std::pair<IndexReference,PatternPointer>(ref,pattern));
 
 1491             std::vector<std::pair<IndexReference,PatternPointer>> result;
 
 1495                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
 
 1497                     result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
 
 1509             std::vector<std::pair<IndexReference,PatternPointer>> result;
 
 1510             for (
int i = 0; i < ref.
token; i++) {
 
 1513                 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
 
 1515                     result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
 
 1527             cache_categories.clear();
 
 1529             cache_grouptotal.clear();
 
 1530             cache_grouptotalpatterns.clear();
 
 1531             cache_categories.insert(0);
 
 1534             while (iter != this->
end()) {
 
 1536                 const int c = pattern.category();
 
 1537                 cache_categories.insert(c);
 
 1538                 const int n = pattern.n();
 
 1544                     cache_grouptotal[c][n] += this->valuehandler.count(iter->second); 
 
 1545                     cache_grouptotal[0][n] += this->valuehandler.count(iter->second);
 
 1546                     cache_grouptotalpatterns[c][n]++;
 
 1547                     cache_grouptotalpatterns[0][n]++;
 
 1549                 cache_grouptotal[c][0] += this->valuehandler.count(iter->second);
 
 1550                 cache_grouptotal[0][0] += this->valuehandler.count(iter->second);
 
 1553                 cache_grouptotalpatterns[c][0]++;
 
 1554                 cache_grouptotalpatterns[0][0]++;
 
 1560             cache_grouptotalwordtypes.clear();
 
 1561             cache_grouptotaltokens.clear();
 
 1570             if ((cache_grouptotal.empty()) && (!this->data.empty())) this->
computestats();
 
 1575             for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
 
 1576                 if ((category == 0) || (*iterc == category)) {
 
 1577                  for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
 
 1578                   if (((n == 0) || (*itern == n)) && (cache_grouptotalwordtypes[*iterc][*itern] == 0) )  {
 
 1579                     std::unordered_set<Pattern> 
types;
 
 1581                     while (iter != this->
end()) {
 
 1583                         const int pn = (int) pattern.n();
 
 1584                         if ( (pn == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {
 
 1585                             types.insert(pattern);
 
 1587                             if (((*itern == 0) || (pn == *itern))  && ((*iterc == 0) || (pattern.category() == *iterc))) {
 
 1588                                 std::vector<PatternType> unigrams;
 
 1589                                 pattern.ngrams(unigrams, 1);
 
 1590                                 for (
typename std::vector<PatternType>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
 
 1596                         cache_grouptotaltokens[*iterc][*itern] += this->valuehandler.count(iter->second);
 
 1599                     cache_grouptotalwordtypes[*iterc][*itern] += types.size();
 
 1625             if ((cache_grouptotal.empty()) && (!this->data.empty())) this->
computestats();
 
 1626             return cache_grouptotal[category][n];
 
 1636             if ((cache_grouptotalpatterns.empty()) && (!this->data.empty())) this->
computestats();
 
 1637             return cache_grouptotalpatterns[category][n];
 
 1648             if ((cache_grouptotalwordtypes.empty()) && (!this->data.empty())) this->
computecoveragestats(category,n);
 
 1649             return cache_grouptotalwordtypes[category][n];
 
 1659             if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) this->
computecoveragestats(category,n);
 
 1660             return cache_grouptotaltokens[category][n];
 
 1692             ValueType * data = 
getdata(pattern, 
true); 
 
 1693             this->
add(pattern, data, ref );
 
 1705             if (value == NULL) {
 
 1706                 std::cerr << 
"Add() value is NULL!" << std::endl;
 
 1709             this->valuehandler.add(value, ref);
 
 1712             if (value == NULL) {
 
 1713                 std::cerr << 
"Add() value is NULL!" << std::endl;
 
 1716             this->valuehandler.add(value, ref);
 
 1728         unsigned int prune(
int threshold,
int _n=0) {
 
 1731             unsigned int pruned = 0;
 
 1733             while (iter != this->
end()) {
 
 1735                 if (( (_n == 0) || (pattern.n() == (
unsigned int) _n) )&& ((threshold == -1) || (
occurrencecount(pattern) < (
unsigned int) threshold))) {
 
 1739                     iter = this->erase(iter); 
 
 1758         virtual unsigned int pruneskipgrams(
unsigned int threshold, 
int minskiptypes=2, 
int _n = 0) {
 
 1760             unsigned int pruned = 0;
 
 1761             if (minskiptypes <=1) 
return pruned; 
 
 1764             while(iter != this->
end()) { 
 
 1766                 if (( (_n == 0) || ((
int) pattern.n() == _n) ) && (pattern.category() == 
SKIPGRAM)) {
 
 1768                         iter = this->erase(iter);
 
 1785             unsigned int pruned = 0;
 
 1790             while (iter != this->
end()) {
 
 1792                 if ( (_n == 0) || (pattern.n() == (
unsigned int) _n) ) {
 
 1793                     if (s.find(pattern) == s.end()) {
 
 1795                         iter = this->erase(iter); 
 
 1810         template<
class ValueType2,
class ValueHandler2,
class MapType2>
 
 1816             unsigned int pruned = 0;
 
 1818             while(iter != this->
end()) { 
 
 1820                 if (!secondmodel.
has(pattern)) {
 
 1821                     iter = this->erase(iter);
 
 1836             std::vector<std::pair<Pattern, int> > v;   
 
 1837             std::vector<std::pair<Pattern, int> > ngrams;
 
 1839             for (std::vector<std::pair<Pattern, int> >::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
 
 1840                 const Pattern p = iter->first;
 
 1841                 if (this->
has(p)) v.push_back(*iter);
 
 1855             bool haveoutput = 
false;
 
 1858                     *out << 
"PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY" << std::endl;
 
 1862                 this->
print(out, decoder, pattern, 
true);
 
 1865                 std::cerr << std::endl << 
"Legend:" << std::endl;
 
 1866                 std::cerr << 
" - PATTERN    : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using  {**}." << std::endl;
 
 1867                 std::cerr << 
" - COUNT      : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
 
 1868                 std::cerr << 
" - TOKENS     : The maximum number of tokens in the corpus that this pattern covers. *THIS IS JUST A MAXIMUM PROJECTION* rather than an exact number because your model is not indexed" << std::endl;
 
 1869                 std::cerr << 
" - COVERAGE   : The maximum number of tokens covered, as a fraction of the total in the corpus (projection)" << std::endl;
 
 1870                 std::cerr << 
" - CATEGORY   : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
 
 1871                 std::cerr << 
" - SIZE       : The size of the pattern (in tokens)" << std::endl;
 
 1872                 std::cerr << 
" - FREQUENCY  : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
 
 1873                 std::cerr << 
" - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
 
 1884             if (!this->reverseindex) 
return;
 
 1889                 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 1891                     *out << 
"\t" << p.
tostring(decoder); 
 
 1903             this->
print(out, decoder); 
 
 1912             const std::string pattern_s = pattern.tostring(decoder);
 
 1916             const double freq = this->
frequency(pattern);
 
 1917             const int cat = pattern.category();
 
 1921             } 
else if (cat == 2) {
 
 1923             } 
else if (cat == 3) {
 
 1926             *out << pattern_s << 
"\t" << count << 
"\t" << 
"\t" << covcount << 
"\t" << coverage << 
"\t" << cat_s << 
"\t" << pattern.size() << 
"\t" << freq;
 
 1927             if (endline) *out << std::endl;
 
 1936             return this->
print(out,decoder,pattern,endline);
 
 1948         void histogram(std::map<unsigned int,unsigned int> & hist, 
unsigned int threshold = 0, 
unsigned int cap = 0, 
int category = 0, 
unsigned int size = 0) {
 
 1951                 if (((category != 0) && (pattern.category() != category)) || ((
size != 0) && (
size != pattern.size()))) 
continue;
 
 1953                 if (c >= threshold) hist[c]++;
 
 1956                 unsigned int sum = 0;
 
 1957                 std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
 
 1958                 while ((sum < cap) && (iter != hist.rend())) {
 
 1960                     sum += iter->second; 
 
 1963                 hist.erase(iter.base(), hist.end());  
 
 1969             std::map<unsigned int,unsigned int> hist;
 
 1971             std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
 
 1972             if (iter != hist.rend()) {
 
 1988         void histogram(std::ostream * OUT, 
unsigned int threshold = 0, 
unsigned int cap = 0 , 
int category = 0, 
unsigned int size = 0) {
 
 1989             std::map<unsigned int,unsigned int> hist;
 
 1991             *OUT << 
"HISTOGRAM" << std::endl;
 
 1992             *OUT << 
"------------------------------" << std::endl;
 
 1993             *OUT << 
"OCCURRENCES\tPATTERNS" << std::endl;
 
 1994             for (std::map<unsigned int,unsigned int>::iterator iter = hist.begin(); iter != hist.end(); iter++) {
 
 1995                 *OUT << iter->first << 
"\t" << iter->second << std::endl;
 
 2004                 *OUT << 
"Type: indexed" << std::endl;
 
 2006                 *OUT << 
"Type: unindexed" << std::endl;
 
 2009                 *OUT << 
"Type: unknown" << std::endl;
 
 2011             *OUT << 
"Total tokens: " << this->totaltokens << std::endl;
 
 2012             *OUT << 
"Total word types: " << this->totaltypes << std::endl;
 
 2013             *OUT << 
"Types patterns loaded: " << this->
size() << std::endl;
 
 2014             *OUT << 
"Min n: " << this->minn << std::endl;
 
 2015             *OUT << 
"Max n: " << this->maxn << std::endl;
 
 2016             if (this->reverseindex)  {
 
 2017                 *OUT << 
"Reverse index: yes" << std::endl;
 
 2018                 *OUT << 
"References in reverse index: " << this->reverseindex->
size() << std::endl;
 
 2020                 *OUT << 
"Reverse index: no" << std::endl;
 
 2022             *OUT << 
"Size of Pattern: " << 
sizeof(
Pattern) << 
" byte" << std::endl;
 
 2023             *OUT << 
"Size of ValueType: " << 
sizeof(ValueType) << 
" byte" << std::endl;
 
 2024             unsigned int totalkeybs = 0;
 
 2025             unsigned int totalvaluebs = 0;
 
 2028                 totalkeybs += 
sizeof(
PatternType) + pattern.bytesize();
 
 2029                 totalvaluebs += 
sizeof(ValueType); 
 
 2031             *OUT << 
"Total key bytesize (patterns): " <<  totalkeybs << 
" bytes (" << (totalkeybs/1024/1024) << 
" MB)" << std::endl;
 
 2032             *OUT << 
"Total value bytesize (counts/index): " <<  totalvaluebs << 
" bytes (" << (totalvaluebs/1024/1024) << 
" MB)" << std::endl;
 
 2033             *OUT << 
"Mean key bytesize: " << (totalkeybs / (float) this->
size()) << std::endl;
 
 2034             *OUT << 
"Mean value bytesize: " << (totalvaluebs / (float) this->
size()) << std::endl;
 
 2036             unsigned int ri_totalkeybs = 0;
 
 2037             unsigned int ri_totalvaluebs = 0;
 
 2038             if (this->reverseindex) {
 
 2040                     ri_totalkeybs += 
sizeof(iter->first.sentence) + 
sizeof(iter->first.token);
 
 2043                 *OUT << 
"Total key bytesize in reverse index (references): " <<  ri_totalkeybs << 
" bytes (" << (ri_totalkeybs/1024/1024) << 
" MB)" << std::endl;
 
 2044                 *OUT << 
"Total value bytesize in reverse index (patterns): " <<  ri_totalvaluebs << 
" bytes (" << (ri_totalvaluebs/1024/1024) << 
" MB)" << std::endl;
 
 2048             const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
 
 2049             *OUT << 
"Total bytesize (without overhead): " << t << 
" bytes (" << (t/1024/1024) << 
" MB)" << std::endl;
 
 2057             if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) {
 
 2058                 std::cerr << 
"Computing statistics..." << std::endl;
 
 2061             *OUT << std::setiosflags(std::ios::fixed) << std::setprecision(4) << std::endl;       
 
 2062             *OUT << 
"REPORT" << std::endl;
 
 2064                 *OUT << 
"   Warning: Model is unindexed, token coverage counts are mere maximal projections" << std::endl;
 
 2065                 *OUT << 
"            assuming no overlap at all!!! Use an indexed model for accurate coverage counts" << std::endl;
 
 2067             *OUT << 
"----------------------------------" << std::endl;
 
 2068             *OUT << 
"                          " << std::setw(15) << 
"PATTERNS" << std::setw(15) << 
"TOKENS" << std::setw(15) << 
"COVERAGE" << std::setw(15) << 
"TYPES" << std::setw(15) << std::endl;
 
 2069             *OUT << 
"Total:                    " << std::setw(15) << 
"-" << std::setw(15) << this->
tokens() << std::setw(15) << 
"-" << std::setw(15) << this->
types() <<  std::endl;
 
 2074             if (coveredtokens > this->
tokens()) coveredtokens = this->
tokens();
 
 2075             unsigned int uncoveredtokens = this->
tokens() - coveredtokens;
 
 2076             if (uncoveredtokens < 0) uncoveredtokens = 0;
 
 2077             *OUT << 
"Uncovered:                " << std::setw(15) << 
"-" << std::setw(15) << uncoveredtokens << std::setw(15) << uncoveredtokens / (double) this->
tokens() << std::setw(15) << this->
types() - coveredtypes <<  std::endl;
 
 2078             *OUT << 
"Covered:                  " << std::setw(15) << this->
size() << std::setw(15) << coveredtokens << std::setw(15) << coveredtokens / (double) this->
tokens() <<  std::setw(15) << coveredtypes <<  std::endl << std::endl;
 
 2082             bool haveoutput = 
false;
 
 2083             for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
 
 2084                 const int c = *iterc;
 
 2085                 if (cache_grouptotalpatterns.count(c))
 
 2086                 for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
 
 2087                     const int n = *itern;
 
 2088                     if (cache_grouptotalpatterns[c].count(n)) {
 
 2091                             *OUT << std::setw(15) << 
"CATEGORY" << std::setw(15) << 
"N (SIZE) "<< std::setw(15) << 
"PATTERNS";
 
 2093                             *OUT << std::setw(15) << 
"TYPES" << std::setw(15) << 
"OCCURRENCES" << std::endl;
 
 2098                             *OUT << std::setw(15) << 
"all";
 
 2099                         } 
else if (c == 
NGRAM) {
 
 2100                             *OUT << std::setw(15) << 
"n-gram";
 
 2102                             *OUT << std::setw(15) << 
"skipgram";
 
 2104                             *OUT << std::setw(15) << 
"flexgram";
 
 2108                             *OUT << std::setw(15) << 
"all";
 
 2110                             *OUT << std::setw(15) << n;
 
 2113                         *OUT << std::setw(15) << cache_grouptotalpatterns[c][n];
 
 2116                             *OUT << std::setw(15) << cache_grouptotaltokens[c][n];
 
 2118                             *OUT << std::setw(15) << cache_grouptotaltokens[c][n] / (double) this->
tokens();
 
 2121                         *OUT << std::setw(15) << cache_grouptotalwordtypes[c][n];
 
 2123                         *OUT << std::setw(15) << cache_grouptotal[c][n] << std::endl;;
 
 2129                 std::cerr << std::endl << 
"Legend:" << std::endl;
 
 2130                 std::cerr << 
" - PATTERNS    : The number of distinct patterns within the group" << std::endl;
 
 2132                     std::cerr << 
" - TOKENS      : The number of tokens that is covered by the patterns in the group." << std::endl;
 
 2133                     std::cerr << 
" - COVERAGE    : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
 
 2135                 std::cerr << 
" - TYPES       : The number of unique *word/unigram* types in this group" << std::endl;
 
 2136                 std::cerr << 
" - OCCURRENCES : The total number of occurrences of the patterns in this group" << std::endl;
 
 2152                 const int patternlength = pattern.n();
 
 2156                     std::vector<Pattern> subngrams;
 
 2158                     for (std::vector<Pattern>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
 
 2159                         const Pattern pattern2 = *iter2;
 
 2191 template<
class MapType = PatternMap<IndexedData,IndexedDataHandler>,
class PatternType = Pattern> 
 
 2196                 const Pattern p = iter->first;
 
 2197                 const int n = p.
n();
 
 2198                 if (n > this->
maxn) this->
maxn = n;
 
 2199                 if (n < this->
minn) this->minn = n;
 
 2204             if (!options.
QUIET) std::cerr << 
"Sorting all indices..." << std::endl;
 
 2206                 iter->second.sort();
 
 2221             this->attachcorpus(*corpus);
 
 2239             this->attachcorpus(*corpus);
 
 2243         this->
load(f,options, constrainmodel);
 
 2258             this->attachcorpus(*corpus);
 
 2262         std::ifstream * in = 
new std::ifstream(filename.c_str());
 
 2263         this->
load( (std::istream *) in, options, constrainmodel);
 
 2283         if (value == NULL) {
 
 2284             value = 
getdata(pattern,
true);
 
 2286         this->valuehandler.add(value, ref);
 
 2289         if (value == NULL) {
 
 2290             value = 
getdata(patternpointer,
true);
 
 2292         this->valuehandler.add(value, ref);
 
 2300         typename MapType::iterator iter = this->find(pattern);
 
 2301         if (iter != this->
end()) {
 
 2302             return &(iter->second); 
 
 2303         } 
else if (makeifnew) {
 
 2304             return &((*this)[pattern]);
 
 2311         typename MapType::iterator iter = this->find(pattern);
 
 2312         if (iter != this->
end()) {
 
 2313             return &(iter->second); 
 
 2314         } 
else if (makeifnew) {
 
 2315             return &((*this)[pattern]);
 
 2322         if ((options.
DOSKIPGRAMS) && (this->reverseindex == NULL)) {
 
 2323             std::cerr << 
"ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
 
 2330         if ((options.
DOSKIPGRAMS) && (this->reverseindex == NULL)) {
 
 2331             std::cerr << 
"ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
 
 2342             *OUT << 
"Type: indexed" << std::endl;
 
 2344             *OUT << 
"Type: unindexed" << std::endl;
 
 2347             *OUT << 
"Type: unknown" << std::endl;
 
 2349         *OUT << 
"Total tokens: " << this->
totaltokens << std::endl;
 
 2350         *OUT << 
"Total word types: " << this->
totaltypes << std::endl;
 
 2351         *OUT << 
"Types patterns loaded: " << this->
size() << std::endl;
 
 2352         *OUT << 
"Min n: " << this->
minn << std::endl;
 
 2353         *OUT << 
"Max n: " << this->
maxn << std::endl;
 
 2355             *OUT << 
"Reverse index: yes" << std::endl;
 
 2356             *OUT << 
"References in reverse index: " << this->
reverseindex->
size() << std::endl;
 
 2358             *OUT << 
"Reverse index: no" << std::endl;
 
 2360         *OUT << 
"Size of Pattern: " << 
sizeof(
Pattern) << 
" byte" << std::endl;
 
 2361         unsigned int totalkeybs = 0;
 
 2362         unsigned int totalvaluebs = 0;
 
 2363         unsigned int indexlengthsum = 0;
 
 2365             const Pattern pattern = iter->first;   
 
 2368             indexlengthsum += iter->second.size();
 
 2370         *OUT << 
"Total key bytesize (patterns): " << totalkeybs << 
" bytes (" << (totalkeybs/1024/1024) << 
" MB)" << std::endl;
 
 2371         *OUT << 
"Total value bytesize (counts/index): " << totalvaluebs << 
" bytes (" << (totalvaluebs/1024/1024) << 
" MB)" << std::endl;
 
 2372         *OUT << 
"Mean key bytesize: " << (totalkeybs / (float) this->
size()) << std::endl;
 
 2373         *OUT << 
"Mean value bytesize: " << (totalvaluebs / (float) this->
size()) << std::endl;
 
 2374         *OUT << 
"Mean index length (ttr): " << (indexlengthsum / (float) this->
size()) << std::endl;
 
 2376         unsigned int ri_totalkeybs = 0;
 
 2377         unsigned int ri_totalvaluebs = 0;
 
 2380                 ri_totalkeybs += 
sizeof(iter->first.sentence) + 
sizeof(iter->first.token);
 
 2383             *OUT << 
"Total key bytesize in reverse index (references): " << ri_totalkeybs << 
" bytes (" << (ri_totalkeybs/1024/1024) << 
" MB)" << std::endl;
 
 2384             *OUT << 
"Total value bytesize in reverse index (patterns): " << ri_totalvaluebs << 
" bytes (" << (ri_totalvaluebs/1024/1024) << 
" MB)" << std::endl;
 
 2387         const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
 
 2388         *OUT << 
"Total bytesize (without overhead): " << t << 
" bytes (" << (t/1024/1024) << 
" MB)" << std::endl;
 
 2399         bool haveoutput = 
false;
 
 2402                 *out << 
"PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY\tREFERENCES" << std::endl;
 
 2406             this->
print(out, decoder, pattern, 
true);
 
 2409             std::cerr << std::endl << 
"Legend:" << std::endl;
 
 2410             std::cerr << 
" - PATTERN    : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;
 
 2411             std::cerr << 
" - COUNT      : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
 
 2412             std::cerr << 
" - TOKENS     : The number of tokens in the corpus that this pattern covers" << std::endl;
 
 2413             std::cerr << 
" - COVERAGE   : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
 
 2414             std::cerr << 
" - CATEGORY   : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
 
 2415             std::cerr << 
" - SIZE       : The size of the pattern (in tokens)" << std::endl;
 
 2416             std::cerr << 
" - FREQUENCY  : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
 
 2417             std::cerr << 
" - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
 
 2422             const std::string pattern_s = pattern.
tostring(decoder);
 
 2426             const double freq = this->
frequency(pattern);
 
 2427             const int cat = pattern.
category();
 
 2431             } 
else if (cat == 2) {
 
 2433             } 
else if (cat == 3) {
 
 2436             *out << pattern_s << 
"\t" << count << 
"\t" << 
"\t" << covcount << 
"\t" << coverage << 
"\t" << cat_s << 
"\t" << pattern.
size() << 
"\t" << freq << 
"\t"; 
 
 2441                 *out << iter2->tostring();
 
 2442                 if (i < count) *out << 
" ";
 
 2444             if (endline) *out << std::endl;
 
 2456         for (
int n = 3; n <= options.
MAXLENGTH; n++) {
 
 2458             if (!options.
QUIET) std::cerr << 
"Counting " << n << 
"-skipgrams" << std::endl; 
 
 2459             int foundskipgrams = 0;
 
 2460             for (
typename MapType::iterator iter = this->
begin(); iter != this->
end(); iter++) {
 
 2463                 if (((
int) pattern.
n() == n) && (pattern.
category() == 
NGRAM) ) foundskipgrams += this->
computeskipgrams(pattern,options, NULL, &multirefs, constrainbymodel, 
false);
 
 2465             if (!foundskipgrams) {
 
 2466                 std::cerr << 
" None found" << std::endl;
 
 2471             if (!options.
QUIET) std::cerr << 
" Found " << foundskipgrams << 
" skipgrams...";
 
 2473             if (!options.
QUIET) std::cerr << 
"pruned " << pruned;
 
 2475             if (prunedextra && !options.
QUIET) std::cerr << 
" plus " << prunedextra << 
" extra skipgrams..";
 
 2476             if (!options.
QUIET) std::cerr << 
"...total kept: " <<  foundskipgrams - pruned - prunedextra << std::endl;
 
 2486             std::cerr << 
"ERROR: getpatternfromtoken() No reverse index loaded" << std::endl;
 
 2502             std::cerr << 
"ERROR: No corpus data loaded! (in PatternModel::getskipcontent)" << std::endl;
 
 2507             const unsigned int n = pattern.
n();
 
 2520                 skipcontent_atref_raw.
mask = skipcontent_mask;
 
 2524                 skipcontent[skipcontent_atref] += 1;
 
 2542         while (iter != relations.
end()) {
 
 2543             if (iter->second < occurrencethreshold) {
 
 2546                 relations.
erase(eraseiter);
 
 2563             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2575         const int _n = pattern.
n();
 
 2582             for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2585                 if (((
int) candidate.
n() == _n)  && (candidate != pattern) && (candidate.
category() == 
SKIPGRAM)  && ((occurrencethreshold == 0) || (this->
occurrencecount(pattern) >= occurrencethreshold)) ) {
 
 2586                     templates[candidate] += 1;
 
 2590         if (occurrencethreshold > 0) this->
prunerelations(templates, occurrencethreshold);
 
 2604             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2616         const int _n = pattern.
n();
 
 2623             for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2626                 if (((
int) candidate.
n() == _n)  && (candidate != pattern) && (candidate.
category() == 
NGRAM) && ((occurrencethreshold == 0) || (this->
occurrencecount(pattern) >= occurrencethreshold))  ) {
 
 2627                     instances[candidate] += 1;
 
 2631         if (occurrencethreshold > 0) this->
prunerelations(instances, occurrencethreshold);
 
 2644             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2657         const int _n = pattern.
n();
 
 2662             for (
int i = ref.
token; i < ref.
token + _n; i++) {
 
 2664                 int maxsubn = _n - (i - ref.
token);
 
 2670                 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2674                     if (((
int) candidate.
n() <= maxsubn) && (candidate != pattern)
 
 2675                         && ((occurrencethreshold == 0) || (this->
occurrencecount(candidate) >= occurrencethreshold))
 
 2676                         && ((category == 0) || (candidate.
category() >= category))
 
 2677                         && ((
size == 0) || (candidate.
n() >= 
size))
 
 2683                                 subchildren[candidate] = subchildren[candidate] + 1;
 
 2688                             subchildren[candidate]++;
 
 2694         if (occurrencethreshold > 0) this->
prunerelations(subchildren, occurrencethreshold);
 
 2709             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2719         const int _n = pattern.
n();
 
 2727             for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2728                 if ((iter2->first.sentence != ref.
sentence) || (iter2->first.token > ref.
token)) 
break;
 
 2731                 int minsubsize = _n + (ref.
token - iter2->first.token);
 
 2733                 if (((
int) candidate.
n() >= minsubsize)  && (candidate != pattern)
 
 2734                         && ((occurrencethreshold == 0) || (this->
occurrencecount(candidate) >= occurrencethreshold))
 
 2735                         && ((category == 0) || (candidate.
category() >= category))
 
 2736                         && ((
size == 0) || (candidate.
n() >= 
size))
 
 2742                             subsumes[candidate] += 1;
 
 2747                         subsumes[candidate] += 1;
 
 2752         if (occurrencethreshold > 0) this->
prunerelations(subsumes, occurrencethreshold);
 
 2766             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2781             for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2785                         && ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
 
 2786                         && ((category == 0) || (neighbour.
category() >= category))
 
 2787                         && ((
size == 0) || (neighbour.
n() >= 
size))
 
 2789                     neighbours[neighbour]++;
 
 2790                     if ((cutoff > 0) && (neighbours.
size() >= cutoff)) 
break;
 
 2793             if ((cutoff > 0) && (neighbours.
size() >= cutoff)) 
break;
 
 2795         if (occurrencethreshold > 0) this->
prunerelations(neighbours, occurrencethreshold);
 
 2808             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2824             for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2826                 if ( ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
 
 2827                         && ((category == 0) || (neighbour.
category() >= category))
 
 2828                         && ((
size == 0) || (neighbour.
n() >= 
size)) ) {
 
 2829                     neighbours[neighbour]++;
 
 2830                     if ((cutoff > 0) && (neighbours.
size() >= cutoff)) 
break;
 
 2833             if ((cutoff > 0) && (neighbours.
size() >= cutoff)) 
break;
 
 2835         if (occurrencethreshold > 0) this->
prunerelations(neighbours, occurrencethreshold);
 
 2847         if (minskiptypes <=1) 
return pruned; 
 
 2850         while(iter != this->
end()) { 
 
 2852             if (( (_n == 0) || ((
int) pattern.n() == _n) ) && (pattern.category() == 
SKIPGRAM)) {
 
 2855                 if (skipcontent2.
size() != skipcontent.
size()) {
 
 2856                     std::cerr << 
" Pattern " << pattern.hash() << 
" discrepancy!!! " << skipcontent.
size() << 
" vs " << skipcontent2.
size() << std::endl;
 
 2860                 if ((
int) skipcontent.
size() < minskiptypes) { 
 
 2862                     iter = this->erase(iter);
 
 2882         if ((this->
cache_n.size() == 1) && (*this->
cache_n.begin() == 1) && (n <= 1)) {
 
 2886             while (iter != this->
end()) {
 
 2894           if ((category == 0) || (*iterc == category)) {
 
 2895             for (std::set<int>::iterator itern = this->
cache_n.begin(); itern != this->
cache_n.end(); itern++) {
 
 2897                 std::unordered_set<Pattern> 
types;
 
 2898                 std::set<IndexReference> 
tokens;
 
 2900                 while (iter != this->
end()) {
 
 2901                     const Pattern pattern = iter->first;                        
 
 2902                     const int n = pattern.
n();
 
 2903                     if ( (n == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.
category() == *iterc))) {
 
 2904                         types.insert(pattern);
 
 2906                         if (((*itern == 0) || (n == *itern))  && ((*iterc == 0) || (pattern.
category() == *iterc))) {
 
 2907                             std::vector<Pattern> unigrams;
 
 2908                             pattern.
ngrams(unigrams, 1);
 
 2909                             for (std::vector<Pattern>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
 
 2918                         for (
unsigned int i = 0; i < pattern.
n(); i++) {
 
 2919                             tokens.insert(*dataiter + i);
 
 2941             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2950         const int _n = pattern.
n();
 
 2958             for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 2962                         && ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
 
 2963                         && ((category == 0) || (neighbour.
category() >= category))
 
 2964                         && ((
size == 0) || (neighbour.
n() >= 
size))
 
 2967                     if (matches != NULL) matches->
insert(ref2);
 
 2971         if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
 
 2985             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 2999             std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->
getreverseindex_left(ref);
 
 3000             for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 3003                 const int _n = neighbour.
n();
 
 3005                         && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
 
 3006                         && ((category == 0) || (neighbour.
category() >= category))
 
 3007                         && ((
size == 0) || (neighbour.
n() >= 
size))
 
 3013         if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
 
 3028             std::cerr << 
"ERROR: No reverse index present" << std::endl;
 
 3037         const int _n = pattern.
n();
 
 3045             for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
 
 3048                 if ((ordersignificant) && (neighbour.
pattern() < pattern)) 
continue;
 
 3049                 const int _n2 = neighbour.
n();
 
 3051                         && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
 
 3052                         && ((category == 0) || (neighbour.
category() >= category))
 
 3053                         && ((
size == 0) || (neighbour.
n() >= 
size))
 
 3059         if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
 
 3083             total += iter->second;
 
 3085         if (total == 0) 
return;
 
 3086         double total_f = total;
 
 3087         const std::string pattern_s = pattern.
tostring(classdecoder);
 
 3090             *OUT << 
"\t" << pattern_s << 
"\t" << label << 
"\t" << pattern2.
tostring(classdecoder) << 
"\t" << iter->second << 
"\t" << iter->second / total_f << 
"\t" << this->
occurrencecount(pattern2) << std::endl;
 
 3102         if (outputheader) *OUT << 
"#\tPATTERN1\tRELATION\tPATTERN2\tREL.COUNT\tREL.FREQUENCY\tCOUNT2" << std::endl;
 
 3105             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"SUBSUMED-BY");
 
 3109             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"SUBSUMES");
 
 3113             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"RIGHT-NEIGHBOUR-OF");
 
 3117             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"LEFT-NEIGHBOUR-OF");
 
 3121             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"LEFT-COOC-OF");
 
 3125             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"RIGHT-COOC-OF");
 
 3129             this->
outputrelations(pattern, relations, classdecoder, OUT, 
"INSTANTIATED-BY");
 
 3141     void computenpmi( std::map<PatternPointer,t_relationmap_double> &  coocmap , 
double threshold, 
bool right=
true, 
bool left=
true) { 
 
 3147             if ((right)&&(!left)) {
 
 3149             } 
else if ((left)&&(!right)) {
 
 3151             } 
else if (left && right) { 
 
 3156                 const double value = 
npmi(pattern,pattern2,iter2->second);
 
 3157                 if (value >= threshold) coocmap[pattern][pattern2] = value;
 
 3169     void computecooc( std::map<PatternPointer,t_relationmap> &  coocmap , 
int threshold, 
bool right=
true, 
bool left=
true) { 
 
 3173             if ((right)&&(!left)) {
 
 3175             } 
else if ((left)&&(!right)) {
 
 3177             } 
else if (left && right) { 
 
 3178                 tmp =  this->
getcooc(pattern, threshold);
 
 3182                 const double value = iter2->second;
 
 3183                 if (value >= threshold) coocmap[pattern][pattern2] = value;
 
 3197             if (pattern.category() == 
SKIPGRAM) {
 
 3198                 const PatternType flexgram = pattern.toflexgram();
 
 3199                 if (!this->
has(flexgram)) count++;
 
 3204                     this->data[flexgram].
insert(ref);
 
 3220         const unsigned char dynamicgap = 129;
 
 3228                 const double value = 
npmi(pattern,pattern2,iter2->second);
 
 3229                 if (value >= threshold) {
 
 3230                     const Pattern flexgram = pattern + dynamicpattern + pattern2;
 
 3231                     if (!this->
has(flexgram)) found++;
 
 3232                     this->data[flexgram] = value;
 
 3244         std::map<PatternPointer,t_relationmap_double> npmimap;
 
 3245         std::cerr << 
"Collecting patterns and computing NPMI..." << std::endl;
 
 3248         std::cerr << 
"Building inverse map..." << std::endl;
 
 3250         std::multimap<double,std::pair<PatternPointer,PatternPointer>> inversemap;
 
 3251         std::map<PatternPointer,t_relationmap_double>::iterator iter = npmimap.begin();
 
 3252         while (iter != npmimap.end()) {
 
 3254                 inversemap.insert(std::pair<
double,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<Pattern,Pattern>(iter->first, iter2->first)));
 
 3256             iter = npmimap.erase(iter);
 
 3259         *OUT << 
"Pattern1\tPattern2\tNPMI" << std::endl;
 
 3260         for (std::multimap<
double,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
 
 3263             *OUT << pattern1.
tostring(classdecoder) << 
"\t" << pattern2.
tostring(classdecoder) << 
"\t" << iter2->first << std::endl;
 
 3272         std::map<PatternPointer,t_relationmap> coocmap;
 
 3273         std::cerr << 
"Collecting patterns and computing co-occurrence..." << std::endl;
 
 3276         std::cerr << 
"Building inverse map..." << std::endl;
 
 3278         std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>> inversemap;
 
 3279         std::map<PatternPointer,t_relationmap>::iterator iter = coocmap.begin();
 
 3280         while (iter != coocmap.end()) {
 
 3282                 inversemap.insert(std::pair<uint32_t,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<PatternPointer,PatternPointer>(iter->first, iter2->first)));
 
 3284             iter = coocmap.erase(iter);
 
 3287         *OUT << 
"Pattern1\tPattern2\tCooc" << std::endl;
 
 3288         for (std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
 
 3289             const Pattern pattern1 = iter2->second.first;
 
 3290             const Pattern pattern2 = iter2->second.second;
 
 3291             *OUT << pattern1.
tostring(classdecoder) << 
"\t" << pattern2.
tostring(classdecoder) << 
"\t" << iter2->first << std::endl;
 
 3304         std::vector<Pattern> parts;
 
 3305         int numberofparts = pattern.
parts(parts);
 
 3306         bool strictbegin = 
true;
 
 3307         std::multimap<int, IndexReference> partmatches;
 
 3310         for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter = rindex.begin(); iter != rindex.end(); iter++) {
 
 3313             partmatches.insert(std::pair<int,IndexReference>(i, ref));
 
 3317         int firsttoken = begin.
token;
 
 3319         for (
int j = 0; j < numberofparts; j++) {
 
 3323            for (std::multimap<int, IndexReference>::iterator iter = partmatches.lower_bound(j); iter != partmatches.upper_bound(j); iter++) {
 
 3325                 if (iter->first != prevlevel) {
 
 3329                 if (((iter->second == begin) || (begin < iter->second)) && (iter->second + parts[j].n() + 1 < nextbegin)) {
 
 3330                     nextbegin = iter->second + parts[j].n() + 1;
 
 3332                 prevlevel = iter->first;
 
 3334            if (!found) 
return 0;
 
 3336         return (nextbegin.
token - firsttoken);
 
 3341 template<
class ValueType, 
class ValueHandler = BaseValueHandler<ValueType>, 
class MapType = PatternPo
interMap<ValueType, BaseValueHandler<ValueType>>>
 
 3350                 this->attachcorpus(*corpus);
 
 3369                 this->attachcorpus(*corpus);
 
 3373             this->
load(f,options, constrainmodel);
 
 3388                 this->attachcorpus(*corpus);
 
 3392             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
 3393             this->
load( (std::istream *) in, options, constrainmodel);
 
 3409             if ((patternpointer.
data < this->reverseindex->beginpointer()) || (patternpointer.
data > this->reverseindex->beginpointer() + this->
reverseindex->
bytesize())) {
 
 3410                 std::cerr << 
"Pattern Pointer points outside contained corpus data..." << std::endl;
 
 3413             ValueType * data = this->
getdata(patternpointer, 
true); 
 
 3417             this->
add(patternpointer, data, ref );
 
 3429             if (value == NULL) {
 
 3430                 std::cerr << 
"Add() value is NULL!" << std::endl;
 
 3433             this->valuehandler.add(value, ref);
 
 3440 template<
class MapType=PatternPo
interMap<IndexedData, IndexedDataHandler>>
 
 3449                 this->attachcorpus(*corpus);
 
 3468                 this->attachcorpus(*corpus);
 
 3472             this->
load(f,options, constrainmodel);
 
 3487                 this->attachcorpus(*corpus);
 
 3491             std::ifstream * in = 
new std::ifstream(filename.c_str());
 
 3492             this->
load( (std::istream *) in, options, constrainmodel);
 
 3508             if ((patternpointer.
data < this->reverseindex->beginpointer()) || (patternpointer.
data > this->reverseindex->beginpointer() + this->
reverseindex->
bytesize())) {
 
 3509                 std::cerr << 
"Pattern Pointer points outside contained corpus data..." << std::endl;
 
 3513             this->
add(patternpointer, data, ref );
 
 3517             if (value == NULL) {
 
 3518                 value = this->
getdata(patternpointer,
true);
 
 3520             this->valuehandler.add(value, ref);
 
void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3271
 
bool out() const 
Definition: pattern.cpp:345
 
virtual int minlength() const  =0
 
void write(std::ostream *out)
Definition: patternmodel.h:438
 
void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:2398
 
unsigned char version() const 
Definition: patternmodel.h:1379
 
int minn
Definition: patternmodel.h:534
 
void report(std::ostream *OUT)
Definition: patternmodel.h:2056
 
virtual t_relationmap gettemplates(const Pattern &pattern, int=0)
Definition: patternmodel.h:2171
 
virtual t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2173
 
int maskheadskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:68
 
virtual t_relationmap_double getnpmi(const Pattern &pattern, double threshold)
Definition: patternmodel.h:2176
 
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:2203
 
virtual void printreverseindex(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1883
 
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:377
 
virtual int minlength() const 
Definition: patternmodel.h:492
 
unsigned char type() const 
Definition: patternmodel.h:1378
 
Definition: patternmodel.h:86
 
virtual t_relationmap getinstances(const Pattern &pattern, int=0)
Definition: patternmodel.h:2172
 
int MINSKIPTYPES
Minimum required amount of distinct patterns that can fit in a gap of a skipgram for the skipgram to ...
Definition: patternmodel.h:136
 
size_t size() const 
Definition: patternstore.h:597
 
const size_t size() const 
Definition: pattern.h:436
 
int ngrams(std::vector< PatternPointer > &container, const int n) const 
Definition: pattern.cpp:1072
 
int MAXLENGTH
The maximum length of patterns to be loaded/extracted, inclusive (in words/tokens) (default: 100) ...
Definition: patternmodel.h:126
 
IndexedData * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:2299
 
Pattern getpatternfromtoken(IndexReference ref)
Definition: patternmodel.h:2484
 
unsigned int totaltokensingroup(int category, int n)
Definition: patternmodel.h:1656
 
void printpattern(std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true)
Definition: patternmodel.h:1935
 
bool erase(const Pattern &pattern)
Definition: patternstore.h:821
 
IndexedCorpus * reverseindex
Pointer to the reverse index and corpus data for this model (or NULL) 
Definition: patternmodel.h:563
 
std::vector< PatternPointer > getreverseindex(const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1408
 
Definition: datatypes.h:477
 
IndexedData * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:2310
 
unsigned int totalpatternsingroup(int category, int n)
Definition: patternmodel.h:1634
 
virtual int maxlength() const 
Definition: patternmodel.h:1312
 
bool DOREVERSEINDEX
Obsolete now, only here for backward-compatibility with v1. 
Definition: patternmodel.h:139
 
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:1680
 
virtual bool has(const Pattern &pattern) const 
Definition: patternmodel.h:364
 
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t >::iterator t_relationmap_iterator
Definition: patternmodel.h:232
 
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:394
 
virtual double frequency(const Pattern &)=0
 
bool empty() const 
Definition: patternstore.h:270
 
unsigned char type() const 
Definition: patternmodel.h:511
 
Definition: patternmodel.h:73
 
std::string tostring(const ClassDecoder &classdecoder) const 
Definition: pattern.cpp:278
 
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:682
 
t_relationmap getsubchildren(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2642
 
virtual t_relationmap getsubchildren(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2169
 
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1711
 
const bool isskipgram() const 
Definition: pattern.h:170
 
void test(MapType &target, std::istream *in)
 
void printmodel(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1902
 
void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3243
 
void computecooc(std::map< PatternPointer, t_relationmap > &coocmap, int threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3169
 
int MINTOKENS
Definition: patternmodel.h:113
 
int getmodelversion() const 
Definition: patternmodel.h:2272
 
Definition: pattern.h:357
 
bool instanceof(const Pattern &skipgram) const 
Definition: pattern.cpp:1533
 
t_relationmap getleftcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2983
 
t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2499
 
std::vector< IndexReference >::iterator iterator
Definition: datatypes.h:109
 
bool DOPATTERNPERLINE
Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a ...
Definition: patternmodel.h:140
 
Contains lower-level containers for patterns. 
 
double comparemodels_loglikelihood(const Pattern pattern, std::vector< PatternModel< uint32_t > * > &models)
Definition: patternmodel.cpp:23
 
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:1569
 
virtual ValueType * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:1343
 
bool DOREMOVESKIPGRAMS
Remove skip-grams from the model upon loading it. 
Definition: patternmodel.h:146
 
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
 
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:2194
 
unsigned int sentences() const 
Definition: patternstore.h:150
 
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:1081
 
Definition: patternstore.h:156
 
A pattern model based on an unordered set, does not hold data, only patterns. Very suitable for loadi...
Definition: patternmodel.h:299
 
PatternPointer getpattern(const IndexReference &begin, int length=1) const 
Definition: pattern.cpp:1764
 
int getmodeltype() const 
Definition: patternmodel.h:3497
 
iterator end()
Definition: patternstore.h:813
 
virtual int getmodelversion() const 
Definition: patternmodel.h:359
 
ModelType
Definition: patternmodel.h:72
 
const size_t bytesize() const 
Definition: pattern.cpp:57
 
t_relationmap getcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, bool ordersignificant=false)
Definition: patternmodel.h:3026
 
t_relationmap getsubparents(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2705
 
virtual void trainskipgrams(const PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:1266
 
double npmi(const PatternPointer &key1, const PatternPointer &key2, int jointcount)
Definition: patternmodel.h:3067
 
virtual int getmodeltype() const 
Definition: patternmodel.h:653
 
t_relationmap getinstances(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2601
 
A model mapping patterns to values, high-level interface. 
Definition: patternmodel.h:526
 
const size_t n() const 
Definition: pattern.cpp:93
 
int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:3192
 
int getmodeltype() const 
Definition: patternmodel.h:3398
 
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807
 
bool DORESET
sets all counts to zero upon loading, clears indices 
Definition: patternmodel.h:148
 
uint64_t totaltokens
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:530
 
virtual int maxlength() const  =0
 
bool DOSKIPGRAMS_EXHAUSTIVE
Load/extract skipgrams in an exhaustive fashion? More memory-intensive, but the only option for unin...
Definition: patternmodel.h:135
 
void output(std::ostream *)
 
vector< pair< int, int > > mask2vector(const uint32_t mask, const int n)
Definition: algorithms.cpp:35
 
Basic read-only interface for pattern models, abstract base class. 
Definition: interface.h:39
 
virtual t_relationmap getleftneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2174
 
t_relationmap getrightneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2806
 
void computestats()
Definition: patternmodel.h:1526
 
t_relationmap getleftneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2764
 
Limited virtual interface to pattern stores. 
Definition: interface.h:20
 
void info(std::ostream *OUT)
Definition: patternmodel.h:2002
 
int getmodelversion() const 
Definition: patternmodel.h:3399
 
virtual int getmodelversion() const  =0
 
PatternSetModel(const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:341
 
unsigned char model_type
Definition: patternmodel.h:528
 
virtual int minlength() const 
Definition: patternmodel.h:1316
 
virtual int getmodelversion() const 
Definition: patternmodel.h:657
 
void prunerelations(t_relationmap &relations, unsigned int occurrencethreshold)
Definition: patternmodel.h:2539
 
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:1321
 
void histogram(std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1948
 
uint64_t totaltokens
Definition: patternmodel.h:303
 
virtual int getmodeltype() const 
Definition: patternmodel.h:358
 
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:558
 
virtual int computeskipgrams(const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false)
Definition: patternmodel.h:1245
 
void end(Measurement &m)
Definition: benchmarks.cpp:156
 
int MINTOKENS_UNIGRAMS
Definition: patternmodel.h:121
 
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44
 
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t > t_relationmap
Definition: patternmodel.h:224
 
int MINTOKENS_SKIPGRAMS
Definition: patternmodel.h:116
 
uint64_t totaltypes
Definition: patternmodel.h:304
 
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:2877
 
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_right(const IndexReference ref)
Definition: patternmodel.h:1489
 
void outputrelations(const PatternPointer &pattern, t_relationmap &relations, ClassDecoder &classdecoder, std::ostream *OUT, const std::string label="RELATED-TO")
Definition: patternmodel.h:3080
 
virtual int maxlength() const 
Definition: patternmodel.h:487
 
Definition: patternmodel.h:88
 
bool DEBUG
Output extra debug information. 
Definition: patternmodel.h:151
 
PatternSet< uint64_t >::const_iterator const_iterator
Definition: patternmodel.h:482
 
const PatternCategory category() const 
Definition: pattern.cpp:42
 
ReverseIndexType
Definition: patternmodel.h:85
 
unsigned char version() const 
Definition: patternmodel.h:515
 
virtual void resetstats()
Definition: patternmodel.h:1559
 
virtual unsigned int tokens() const  =0
 
bool DOREMOVEFLEXGRAMS
Remove flexgrams from the model upon loading it. 
Definition: patternmodel.h:147
 
bool DOSKIPGRAMS
Load/extract skipgrams? (default: false) 
Definition: patternmodel.h:134
 
Definition: patternmodel.h:3441
 
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
 
MapType::const_iterator const_iterator
Definition: patternmodel.h:1307
 
Reference to a position in the corpus. 
Definition: datatypes.h:33
 
Definition: patternmodel.h:75
 
Definition: patternmodel.h:77
 
A pattern map storing patterns and their values in a hash map (unordered_map). 
Definition: patternstore.h:782
 
virtual ValueType * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:1354
 
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789
 
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644
 
unsigned char classencodingversion
Definition: patternstore.h:328
 
PatternSet< uint64_t > extractset(int minlength=1, int maxlength=1)
Definition: patternmodel.h:2147
 
unsigned char model_version
Definition: patternmodel.h:529
 
std::map< int, std::map< int, unsigned int > > cache_grouptotal
total occurrences (used for frequency computation, within a group) 
Definition: patternmodel.h:539
 
int getmodelversion() const 
Definition: patternmodel.h:3498
 
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const 
Definition: pattern.cpp:1142
 
const size_t bytesize() const 
Definition: pattern.h:435
 
t_relationmap gettemplates(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2559
 
PatternSet< uint64_t >::iterator iterator
Definition: patternmodel.h:481
 
iterator end()
Definition: datatypes.h:115
 
bool isgap(int i) const 
Definition: pattern.cpp:126
 
virtual bool has(const Pattern &pattern) const 
Definition: patternmodel.h:669
 
void write(std::ostream *out)
Definition: patternstore.h:632
 
virtual int getmodeltype() const  =0
 
double frequency(const Pattern &pattern)
Definition: patternmodel.h:1666
 
Definition: patternmodel.h:3342
 
Definition: patternmodel.h:78
 
unsigned int totalwordtypesingroup(int category, int n)
Definition: patternmodel.h:1645
 
virtual int computeflexgrams_fromcooc()
Definition: patternmodel.h:2178
 
const size_t size() const 
Definition: pattern.h:156
 
MapType::iterator iterator
Definition: patternmodel.h:1306
 
bool DOREMOVEINDEX
Do not load index information (for indexed models), loads just the patterns without any counts...
Definition: patternmodel.h:144
 
int maxn
Definition: patternmodel.h:533
 
PatternModelInterface * getinterface()
Definition: patternmodel.h:465
 
virtual bool has(const PatternPointer &pattern) const 
Definition: patternmodel.h:672
 
virtual unsigned int types()=0
 
virtual void add(const Pattern &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1704
 
bool QUIET
Don't output to stderr. 
Definition: patternmodel.h:150
 
iterator end()
Definition: patternstore.h:224
 
unsigned int prune(int threshold, int _n=0)
Definition: patternmodel.h:1728
 
int MAXBACKOFFLENGTH
Definition: patternmodel.h:127
 
virtual std::vector< PatternPointer > findskipgrams(const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3)
Definition: patternmodel.h:1254
 
virtual unsigned int occurrencecount(const Pattern &pattern)=0
 
int MAXSKIPS
Maximum skips per skipgram. 
Definition: patternmodel.h:137
 
virtual bool has(const PatternPointer &pattern) const 
Definition: patternmodel.h:367
 
unsigned char model_type
Definition: patternmodel.h:301
 
void write(const std::string filename)
Definition: patternmodel.h:1299
 
Options for Pattern Model loading and training. 
Definition: patternmodel.h:111
 
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39
 
int sentencelength(int sentence) const 
Definition: pattern.cpp:1806
 
int PRUNENONSUBSUMED
Definition: patternmodel.h:142
 
uint16_t token
Definition: datatypes.h:36
 
int ngrams(std::vector< Pattern > &container, const int n) const 
Definition: pattern.cpp:1050
 
std::set< int > cache_n
Definition: patternmodel.h:538
 
size_t size() const 
Definition: patternstore.h:800
 
iterator begin()
Definition: patternstore.h:810
 
int getmodeltype(const std::string &filename)
Definition: patternmodel.cpp:4
 
int getmodeltype() const 
Definition: patternmodel.h:2271
 
virtual void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2180
 
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const 
Definition: pattern.cpp:1120
 
Definition: patternmodel.h:76
 
void insert(IndexReference ref)
Definition: datatypes.h:106
 
Collection of references to positions in the corpus (IndexReference). Used by Indexed Pattern models...
Definition: datatypes.h:86
 
virtual void print(std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true)
Definition: patternmodel.h:1911
 
std::map< int, std::vector< uint32_t > > gapmasks
pre-computed masks representing possible gap configurations for various pattern lengths ...
Definition: patternmodel.h:545
 
bool reverseindex_internal
Definition: patternmodel.h:564
 
virtual t_relationmap getsubparents(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2170
 
virtual double frequency(const Pattern &)
Definition: patternmodel.h:479
 
PatternSetModel()
Definition: patternmodel.h:311
 
double coverage(const Pattern &key)
Definition: patternmodel.h:1397
 
PatternMap< double, BaseValueHandler< double >, uint64_t > t_relationmap_double
Definition: patternmodel.h:230
 
unsigned int bytesize() const 
Definition: patternstore.h:118
 
uint32_t mask
Definition: pattern.h:362
 
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_left(const IndexReference ref)
Definition: patternmodel.h:1507
 
virtual int computeskipgrams(const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false)
Definition: patternmodel.h:1101
 
const PatternCategory category() const 
Definition: pattern.cpp:46
 
virtual void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2179
 
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:778
 
void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:3516
 
PatternPointer getsentence(int sentence) const 
Definition: pattern.cpp:1826
 
bool DOREMOVENGRAMS
Remove n-grams from the model upon loading it. 
Definition: patternmodel.h:145
 
std::set< int > cache_categories
Definition: patternmodel.h:537
 
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_bysentence(int sentence)
Definition: patternmodel.h:1471
 
void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3507
 
void outputrelations(const PatternPointer &pattern, ClassDecoder &classdecoder, std::ostream *OUT, bool outputheader=true)
Definition: patternmodel.h:3101
 
void write(const std::string &filename)
Definition: patternmodel.h:455
 
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2329
 
PatternModelInterface * getinterface()
Definition: patternmodel.h:765
 
Definition: patternmodel.h:74
 
iterator begin()
Definition: patternstore.h:214
 
virtual unsigned int tokens() const 
Definition: patternmodel.h:505
 
unsigned char * data
Definition: pattern.h:360
 
int parts(std::vector< PatternPointer > &container) const 
Definition: pattern.cpp:1337
 
size_t size()
Definition: patternstore.h:261
 
Class for encoding plain-text to binary class-encoded data. 
 
void computenpmi(std::map< PatternPointer, t_relationmap_double > &coocmap, double threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3141
 
const size_t n() const 
Definition: pattern.cpp:89
 
void info(std::ostream *OUT)
Definition: patternmodel.h:2340
 
int pruneskipgrams(int threshold, int minskiptypes, int _n=0)
Definition: patternmodel.h:2845
 
unsigned int coveragecount(const Pattern &key)
Definition: patternmodel.h:1389
 
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of ...
Definition: patternmodel.h:2192
 
unsigned int topthreshold(int amount, int category=0, int size=0)
Definition: patternmodel.h:1967
 
uint32_t reversemask(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:58
 
void histogram(std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1988
 
virtual unsigned int occurrencecount(const PatternPointer &pattern)
Definition: patternmodel.h:1330
 
std::vector< IndexReference >::const_iterator const_iterator
Definition: datatypes.h:110
 
A pattern store in the form of an unordered set (i.e., no duplicates). Stores only patterns...
Definition: patternstore.h:538
 
void print(std::ostream *out, ClassDecoder &decoder, const PatternPointer &pattern, bool endline=true)
Definition: patternmodel.h:2421
 
uint64_t totaltypes
Total number of unigram/word types in the original corpus, SO INCLUDING NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:531
 
unsigned int prunenotinset(const std::unordered_set< Pattern > &s, int _n)
Definition: patternmodel.h:1784
 
virtual unsigned int types()
Definition: patternmodel.h:498
 
std::map< int, std::map< int, unsigned int > > cache_grouptotalwordtypes
total covered word types per group 
Definition: patternmodel.h:541
 
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2321
 
int masktailskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:77
 
PatternType
Definition: pattern.h:59
 
virtual int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:2177
 
PatternModelOptions(const PatternModelOptions &ref)
Definition: patternmodel.h:188
 
Definition: patternmodel.h:97
 
unsigned int prunebymodel(PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel)
Definition: patternmodel.h:1811
 
int MINLENGTH
The minimum length of patterns to be loaded/extracted (in words/tokens) (default: 1) ...
Definition: patternmodel.h:125
 
void write(std::ostream *out)
Definition: patternmodel.h:1279
 
virtual unsigned int types()
Definition: patternmodel.h:1368
 
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3408
 
int maxn
Definition: patternmodel.h:305
 
Pattern pattern() const 
Definition: pattern.h:527
 
virtual size_t size() const 
Definition: patternmodel.h:662
 
int minn
Definition: patternmodel.h:306
 
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:473
 
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:700
 
unsigned char getdataversion(std::istream *in)
Definition: classdecoder.cpp:257
 
void insert(const Pattern &pattern)
Definition: patternstore.h:580
 
Measurement begin(const string &title)
Definition: benchmarks.cpp:148
 
iterator begin()
Definition: datatypes.h:112
 
int parts(std::vector< Pattern > &container) const 
Definition: pattern.cpp:1225
 
PatternModelOptions()
Definition: patternmodel.h:157
 
virtual void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1854
 
virtual unsigned int tokens() const 
Definition: patternmodel.h:1376
 
unsigned char model_version
Definition: patternmodel.h:302
 
uint32_t sentence
Definition: datatypes.h:35
 
std::map< int, std::map< int, unsigned int > > cache_grouptotalpatterns
total distinct patterns per group 
Definition: patternmodel.h:540
 
bool has(const Pattern &pattern) const 
Definition: patternstore.h:587
 
virtual PatternStoreInterface * getstoreinterface()
Definition: patternmodel.h:288
 
std::vector< std::pair< Pattern, int > > getpatterns(const Pattern &pattern)
Definition: patternmodel.h:1834
 
PatternMap< double, BaseValueHandler< double >, uint64_t >::iterator t_relationmap_double_iterator
Definition: patternmodel.h:233
 
virtual t_relationmap getrightneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2175
 
virtual void trainskipgrams(PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:2453
 
bool hasskipgrams
Does this model have skipgrams? 
Definition: patternmodel.h:565
 
Definition: patternmodel.h:87
 
int flexgramsize(const Pattern &pattern, IndexReference begin)
Definition: patternmodel.h:3300
 
std::map< int, std::map< int, unsigned int > > cache_grouptotaltokens
Total number of covered tokens per group.
Definition: patternmodel.h:542
 
std::string tostring(const ClassDecoder &classdecoder) const 
Definition: pattern.cpp:283
 
virtual void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2288
 
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:547
 
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:3428
 
virtual size_t size() const 
Definition: patternmodel.h:361
 
vector< uint32_t > compute_skip_configurations(const int n, const int maxskips)
Definition: algorithms.cpp:85
 
unsigned int totaloccurrencesingroup(int category, int n)
Definition: patternmodel.h:1623
 
int computeflexgrams_fromcooc(double threshold)
Definition: patternmodel.h:3217
 
virtual void add(const Pattern &pattern, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2282
 
std::string tostring() const 
Definition: datatypes.h:72
 
PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:325
 
virtual void outputrelations(const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT)
Definition: patternmodel.h:2168
 
t_relationmap getrightcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, IndexedData *matches=NULL)
Definition: patternmodel.h:2939
 
virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n=0)
Definition: patternmodel.h:1758