66 #include "bz2stream.h"
98 virtual const char* what()
const throw()
100 return "Pattern not found in model";
159 MINTOKENS_SKIPGRAMS = -1;
160 MINTOKENS_UNIGRAMS = 1;
163 MAXBACKOFFLENGTH = 100;
168 DOSKIPGRAMS_EXHAUSTIVE =
false;
170 DOREVERSEINDEX =
true;
171 DOPATTERNPERLINE =
false;
174 DOREMOVEINDEX =
false;
175 DOREMOVENGRAMS =
false;
176 DOREMOVESKIPGRAMS =
false;
177 DOREMOVEFLEXGRAMS =
false;
179 PRUNENONSUBSUMED =
false;
280 virtual unsigned int types() =0;
286 virtual unsigned int tokens()
const=0;
332 this->
load(f,options, constrainmodel);
348 if (!options.
QUIET) std::cerr <<
"Loading " << filename << std::endl;
349 std::ifstream * in =
new std::ifstream(filename.c_str());
351 std::cerr <<
"ERROR: Unable to load file " << filename << std::endl;
354 this->
load( (std::istream *) in, options, constrainmodel);
378 if (!options.
QUIET) std::cerr <<
"Loading " << filename <<
" as set-model" << std::endl;
379 std::ifstream * in =
new std::ifstream(filename.c_str());
381 std::cerr <<
"ERROR: Unable to load file " << filename << std::endl;
384 this->
load( (std::istream *) in, options, constrainmodel);
396 f->read( (
char*) &null,
sizeof(
char));
397 f->read( (
char*) &model_type,
sizeof(
char));
398 f->read( (
char*) &model_version,
sizeof(
char));
401 std::cerr <<
"ERROR: File is not a colibri patternmodel file" << std::endl;
404 if (model_version > 2) {
405 std::cerr <<
"WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
407 f->read( (
char*) &totaltokens,
sizeof(uint64_t));
408 f->read( (
char*) &totaltypes,
sizeof(uint64_t));
411 if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();
414 std::cerr <<
"Debug enabled, loading PatternModel type " << (int) model_type <<
", version " << (
int) model_version <<
", classencodingversion" << (int) this->
classencodingversion << std::endl;
415 std::cerr <<
"Total tokens: " << totaltokens <<
", total types: " << totaltypes << std::endl;;
430 std::cerr <<
"ERROR: Unknown model type" << std::endl;
440 out->write( (
char*) &null,
sizeof(
char));
442 out->write( (
char*) &t,
sizeof(
char));
444 out->write( (
char*) &v,
sizeof(
char));
445 out->write( (
char*) &totaltokens,
sizeof(uint64_t));
446 const uint64_t tp = this->
types();
447 out->write( (
char*) &tp,
sizeof(uint64_t));
455 void write(
const std::string & filename) {
456 std::ofstream * out =
new std::ofstream(filename.c_str());
525 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class MapType = PatternMap<ValueType, BaseValueHandler<ValueType>>,
class PatternType = Pattern>
553 if (n > maxn) maxn = n;
554 if (n < minn) minn = n;
575 hasskipgrams =
false;
579 this->reverseindex = corpus;
580 this->attachcorpus(*corpus);
582 this->reverseindex = NULL;
584 reverseindex_internal =
false;
599 hasskipgrams =
false;
602 this->
load(f,options,constrainmodel);
604 this->reverseindex = corpus;
605 this->attachcorpus(*corpus);
607 this->reverseindex = NULL;
609 reverseindex_internal =
false;
613 if (reverseindex_internal && reverseindex != NULL)
delete reverseindex;
628 hasskipgrams =
false;
632 this->reverseindex = corpus;
633 this->attachcorpus(*corpus);
635 this->reverseindex = NULL;
637 reverseindex_internal =
false;
638 if (!options.QUIET) std::cerr <<
"Loading " << filename << std::endl;
639 std::ifstream * in =
new std::ifstream(filename.c_str());
641 std::cerr <<
"ERROR: Unable to load file " << filename << std::endl;
644 this->
load( (std::istream *) in, options, constrainmodel);
663 return MapType::size();
670 return MapType::has(pattern);
673 return MapType::has(pattern);
683 if (!options.
QUIET) std::cerr <<
"Loading " << filename << std::endl;
684 std::ifstream * in =
new std::ifstream(filename.c_str());
686 std::cerr <<
"ERROR: Unable to load file " << filename << std::endl;
689 this->
load( (std::istream *) in, options, constrainmodel);
702 f->read( (
char*) &null,
sizeof(
char));
703 f->read( (
char*) &model_type,
sizeof(
char));
704 f->read( (
char*) &model_version,
sizeof(
char));
705 if (model_version == 1) this->classencodingversion = 1;
707 std::cerr <<
"File is not a colibri model file (or a very old one)" << std::endl;
710 if (model_version > 2) {
711 std::cerr <<
"WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
714 std::cerr <<
"Debug enabled, loading PatternModel type " << (int) model_type <<
", version " << (
int) model_version <<
", classencodingversion=" << (int) this->classencodingversion << std::endl;
718 if (options.
DEBUG) std::cerr <<
"Reading corpus data" << std::endl;
719 unsigned int corpussize;
720 f->read( (
char*) &corpussize,
sizeof(
unsigned int));
721 unsigned char * corpusdata =
new unsigned char[corpussize];
722 f->read((
char*) corpusdata,
sizeof(
unsigned char) * corpussize);
724 this->attachcorpus(*reverseindex);
725 reverseindex_internal =
true;
726 if (options.
DEBUG) std::cerr <<
"(read " << corpussize <<
" bytes)" << std::endl;
728 f->read( (
char*) &totaltokens,
sizeof(uint64_t));
729 f->read( (
char*) &totaltypes,
sizeof(uint64_t));
735 std::cerr <<
"Total tokens: " << totaltokens <<
", total types: " << totaltypes << std::endl;;
741 MapType::template read<IndexedData,IndexedDataHandler,PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET, options.
DEBUG);
744 MapType::template read<uint32_t,BaseValueHandler<uint32_t>,
PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET, options.
DEBUG);
747 MapType::template read<uint32_t,BaseValueHandler<uint32_t>,
PatternPointer>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET, options.
DEBUG);
750 MapType::template read<IndexedData,IndexedDataHandler,PatternPointer>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS, options.
DORESET, options.
DEBUG);
755 MapType::template read<PatternFeatureVectorMap<double>,
PatternFeatureVectorMapHandler<double>,
PatternType>(f, options.
MINTOKENS, options.
MINLENGTH,options.
MAXLENGTH, constrainstore, !options.
DOREMOVENGRAMS, !options.
DOREMOVESKIPGRAMS, !options.
DOREMOVEFLEXGRAMS,options.
DORESET, options.
DEBUG);
782 if (constrainbymodel ==
this) {
785 }
else if (constrainbymodel != NULL) {
786 totaltypes = constrainbymodel->types();
787 totaltokens = constrainbymodel->tokens();
789 uint32_t sentence = firstsentence-1;
792 bool iter_unigramsonly =
false;
793 bool skipunigrams =
false;
795 iter_unigramsonly =
true;
798 if (!options.
QUIET) {
799 std::cerr <<
"Training patternmodel";
800 if (constrainbymodel != NULL) std::cerr <<
", constrained by another model";
801 std::cerr <<
", occurrence threshold: " << options.
MINTOKENS;
802 if (iter_unigramsonly) std::cerr <<
", secondary word occurrence threshold: " << options.
MINTOKENS_UNIGRAMS;
803 if (version < 2) std::cerr <<
", class encoding version: " << (int) version;
804 std::cerr << std::endl;
806 std::vector<std::pair<PatternPointer,int> > ngrams;
807 std::vector<PatternPointer> subngrams;
810 int prevsize = this->
size();
811 if (constrainbymodel ==
this) prevsize = 0;
815 if (!this->data.empty()) {
816 if ((continued) && (!options.
QUIET)) std::cerr <<
"Continuing training on preloaded model, computing statistics..." << std::endl;
820 for (
int n = 1; n <= options.
MAXLENGTH; n++) {
821 bool skipgramsonly =
false;
823 if ((options.
MINTOKENS > 1) && (constrainbymodel == NULL)) {
824 if (cache_grouptotal[
NGRAM][n] > 0) {
828 if (!options.
QUIET) std::cerr <<
"Skipping " << n <<
"-grams, already in model" << std::endl;
835 int foundskipgrams = 0;
844 if (!options.
QUIET) {
845 if (iter_unigramsonly) {
846 std::cerr <<
"Counting unigrams using secondary word occurrence threshold (" << options.
MINTOKENS_UNIGRAMS <<
")" << std::endl;
848 std::cerr <<
"Counting patterns from list, one per line" << std::endl;
849 }
else if (constrainbymodel != NULL) {
850 std::cerr <<
"Counting n-grams that occur in constraint model" << std::endl;
852 std::cerr <<
"Counting " << n <<
"-grams" << std::endl;
853 if (skipgramsonly) std::cerr <<
"(only counting skipgrams actually, n-grams already counted earlier)" << std::endl;
855 std::cerr <<
"Counting *all* n-grams (occurrence threshold=1)" << std::endl;
863 sentence = firstsentence-1;
864 bool singlepass =
false;
865 const unsigned int sentences = (reverseindex != NULL) ? reverseindex->
sentences() : 0;
866 while (((reverseindex != NULL) && (sentence < sentences)) || ((reverseindex == NULL) && (in != NULL) && (!in->eof()))) {
869 if (linepattern != NULL)
delete linepattern;
870 if (reverseindex == NULL) linepattern =
new Pattern(in,
false,version);
873 const unsigned int linesize = line.
n();
874 if (options.
DEBUG) std::cerr <<
"Processing line " << sentence <<
", size (tokens) " << linesize <<
" (bytes) " << line.
bytesize() <<
", n=" << n << std::endl;
880 if ((n==1) && (!continued)) totaltokens += linesize;
885 if (linesize > (
unsigned int) options.
MAXLENGTH)
continue;
886 ngrams.push_back(std::pair<PatternPointer,int>(line,0));
888 if (iter_unigramsonly) {
890 }
else if ((options.
MINTOKENS > 1) && (constrainbymodel == NULL)) {
895 if (continued) minlength = this->maxn + 1;
899 if (options.
DEBUG) std::cerr <<
"\t" << ngrams.size() <<
" ngrams in line" << std::endl;
903 for (std::vector<std::pair<PatternPointer,int>>::
iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
906 if ((singlepass) && (options.
MINLENGTH == 1) && (skipunigrams) && (iter->first.n() == 1)) {
911 if (!skipgramsonly) {
913 if ((constrainbymodel != NULL) && (!iter_unigramsonly) && (!constrainbymodel->has(iter->first)))
continue;
921 iter->first.ngrams(subngrams,1);
922 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
938 iter->first.ngrams(subngrams, backoffn);
939 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
940 if (!this->
has(*iter2)) {
949 if ((found) && (!skipgramsonly)) {
950 if (options.
DEBUG) std::cerr <<
"\t\tAdding @" << ref.
sentence <<
":" << ref.
token <<
" n=" << iter->first.n() <<
" category=" <<(int) iter->first.category()<< std::endl;
951 add(iter->first, ref);
954 if (((n >= 3) || (options.
MINTOKENS == 1))
956 int foundskipgrams_thisround = this->
computeskipgrams(iter->first, options, &ref, NULL, constrainbymodel,
true );
957 if (foundskipgrams_thisround > 0) hasskipgrams =
true;
958 foundskipgrams += foundskipgrams_thisround;
960 }
catch (std::exception &e) {
961 std::cerr <<
"ERROR: An internal error has occured during training!!!" << std::endl;
962 if (ignoreerrors)
continue;
969 if (!iter_unigramsonly) {
970 foundngrams = this->
size() - foundskipgrams - prevsize;
972 if ((foundngrams) || (foundskipgrams)) {
973 if (n > this->maxn) this->maxn = n;
974 if (n < this->minn) this->minn = n;
976 if (!options.
QUIET) std::cerr <<
"None found" << std::endl;
977 if (!continued)
break;
979 if (!options.
QUIET) std::cerr <<
" Found " << foundngrams <<
" ngrams...";
981 if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel ==
this))) {
982 if ((options.
MINTOKENS > 1) && (n == 1)) {
983 totaltypes = this->
size();
985 if (!options.
QUIET) std::cerr <<
" computing total word types prior to pruning...";
987 if (!options.
QUIET) std::cerr << totaltypes <<
"...";
1000 this->
prune(-1, n-1);
1001 if (!options.
QUIET) std::cerr <<
" (pruned last iteration due to minimum length)" << pruned;
1004 if (!options.
QUIET) std::cerr <<
"pruned " << pruned;
1005 if (foundskipgrams) {
1006 unsigned int prunedextra;
1007 if ((options.
MINTOKENS == 1) || (constrainbymodel != NULL)) {
1012 if (prunedextra && !options.
QUIET) std::cerr <<
" plus " << prunedextra <<
" extra skipgrams..";
1013 pruned += prunedextra;
1015 if (!options.
QUIET) std::cerr <<
"...total kept: " << (foundngrams + foundskipgrams) - pruned << std::endl;
1016 if (((options.
MINTOKENS == 1) || (constrainbymodel != NULL)))
break;
1018 if (!options.
QUIET) std::cerr <<
"found " << this->
size() << std::endl;
1020 if ((!continued) && ((constrainbymodel == NULL) or (constrainbymodel ==
this))) {
1021 if (!options.
QUIET) std::cerr <<
" computing total word types prior to pruning...";
1022 totaltypes = this->
size();
1023 if (!options.
QUIET) std::cerr << totaltypes <<
"...";
1028 iter_unigramsonly =
false;
1029 if ((n == 1) && (options.
MINLENGTH ==1)) skipunigrams =
true;
1033 prevsize = this->
size();
1050 if (!options.
QUIET) std::cerr <<
"Pruning non-subsumed n-grams" << std::endl;
1053 for (
int n = begin_n; n > 1; n--) {
1054 std::unordered_set<Pattern> subsumed;
1055 unsigned int prunednonsubsumed = 0;
1057 while (iter != this->
end()) {
1058 const unsigned int pattern_n = iter->first.n();
1059 if (pattern_n == (
unsigned int) n) {
1061 iter->first.ngrams(subngrams, n-1);
1062 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(
Pattern(*iter2));
1067 if (!options.
QUIET) std::cerr <<
" pruned " << prunednonsubsumed <<
" non-subsumed " << (n-1) <<
"-grams" << std::endl;
1071 if (linepattern != NULL)
delete linepattern;
1082 if ((filename.size() > 3) && (filename.substr(filename.size()-3) ==
".bz2")) {
1083 std::ifstream * in =
new std::ifstream(filename.c_str(), std::ios::in|std::ios::binary);
1084 bz2istream * decompressor =
new bz2istream(in->rdbuf());
1085 this->
train( (std::istream*) decompressor, options, constrainbymodel, continued, firstsentence, ignoreerrors);
1086 delete decompressor;
1089 std::ifstream * in =
new std::ifstream(filename.c_str());
1090 this->
train((std::istream*) in, options, constrainbymodel, continued, firstsentence, ignoreerrors);
1106 if (mintokens == -1) mintokens = 2;;
1107 if (mintokens <= 1) {
1111 int foundskipgrams = 0;
1112 const int n = pattern.
n();
1113 std::vector<PatternPointer> subngrams;
1119 for (std::vector<uint32_t>::iterator iter2 = gapmasks[n].
begin(); iter2 != gapmasks[n].end(); iter2++, gapconf_i++) {
1120 if (*iter2 == 0)
continue;
1125 skipgram.
mask = *iter2;
1128 std::cerr <<
"Checking for: " << std::endl;
1132 if ((constrainbymodel != NULL) && (!constrainbymodel->has(skipgram)))
continue;
1135 if ((
int) skipgram.
n() != n) {
1136 std::cerr <<
"Generated invalid skipgram, n=" << skipgram.
n() <<
", expected " << n << std::endl;
1141 bool skipgram_valid =
true;
1142 if ((mintokens != 1) && (constrainbymodel == NULL)) {
1143 bool check_extra =
false;
1146 skipgram.
ngrams(subngrams,n-1);
1147 for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
1149 if (!subpattern.
isgap(0) && !subpattern.
isgap(subpattern.
n() - 1)) {
1154 std::cerr <<
"Subpattern: " << std::endl;
1157 if (!this->
has(subpattern)) {
1158 if (DEBUG) std::cerr <<
" discarded" << std::endl;
1159 skipgram_valid =
false;
1170 if (!skipgram_valid)
continue;
1178 std::vector<PatternPointer> parts;
1179 skipgram.
parts(parts);
1180 for (std::vector<PatternPointer>::iterator iter3 = parts.begin(); iter3 != parts.end(); iter3++) {
1182 if (!this->
has(part)) {
1183 skipgram_valid =
false;
1187 if (!skipgram_valid)
continue;
1192 const std::vector<std::pair<int,int>> gapconfiguration =
mask2vector(skipgram.
mask, n);
1193 for (std::vector<std::pair<int,int>>::
const_iterator iter3 = gapconfiguration.begin(); iter3 != gapconfiguration.end(); iter3++) {
1194 if (!((iter3->first - 1 == 0) && (iter3->first + iter3->second + 1 == n))) {
1197 std::cerr <<
"Subskipgram: " << std::endl;
1200 if (!this->
has(subskipgram)) {
1201 if (DEBUG) std::cerr <<
" discarded" << std::endl;
1202 skipgram_valid =
false;
1211 if (skipgram_valid) {
1212 if (DEBUG) std::cerr <<
" counted!" << std::endl;
1213 if (targetcontainer == NULL) {
1215 if (!
has(skipgram)) foundskipgrams++;
1216 if (singleref != NULL) {
1217 add(skipgram, *singleref );
1218 }
else if (multiplerefs != NULL) {
1221 add(skipgram, ref );
1224 std::cerr <<
"ERROR: computeskipgrams() called with no singleref and no multiplerefs" << std::endl;
1230 targetcontainer->push_back(skipgram);
1236 std::cerr <<
"IGNORING ERROR and continuing with next skipgram" << std::endl;
1239 return foundskipgrams;
1257 std::vector<PatternPointer> skipgrams;
1267 std::cerr <<
"Can not compute skipgrams on unindexed model (except exhaustively during train() )" << std::endl;
1274 void test(MapType & target, std::istream * in);
1280 const char null = 0;
1281 out->write( (
char*) &null,
sizeof(
char));
1283 out->write( (
char*) &t,
sizeof(
char));
1285 out->write( (
char*) &v,
sizeof(
char));
1287 out->write( (
char*) &this->corpussize,
sizeof(
unsigned int));
1288 out->write((
char*) this->corpusstart,
sizeof(
unsigned char) * this->corpussize);
1290 out->write( (
char*) &totaltokens,
sizeof(uint64_t));
1291 const uint64_t tp = this->
types();
1292 out->write( (
char*) &tp,
sizeof(uint64_t));
1293 MapType::write(out);
1299 void write(
const std::string filename) {
1300 std::ofstream * out =
new std::ofstream(filename.c_str());
1322 ValueType * data = this->
getdata(pattern);
1324 return this->valuehandler.count(*data);
1331 ValueType * data = this->
getdata(pattern);
1333 return this->valuehandler.count(*data);
1344 typename MapType::iterator iter = this->find(pattern);
1345 if (iter != this->
end()) {
1346 return &(iter->second);
1347 }
else if (makeifnew) {
1348 return &((*this)[pattern]);
1355 typename MapType::iterator iter = this->find(pattern);
1356 if (iter != this->
end()) {
1357 return &(iter->second);
1358 }
else if (makeifnew) {
1359 return &((*this)[pattern]);
1381 void output(std::ostream *);
1410 std::vector<PatternPointer> result;
1411 if (!this->reverseindex)
return result;
1414 const unsigned int minn = this->
minlength();
1415 const unsigned int maxn = this->
maxlength();
1416 for (
unsigned int n = minn; ref.
token + n <= sl && n <=
maxn; n++) {
1425 && ((category == 0) || (ngram.
category() >= category)) ) {
1426 result.push_back(ngram);
1428 if (((category == 0) || (category ==
SKIPGRAM)) && (this->hasskipgrams)) {
1434 std::vector<PatternPointer> skipgrams = this->
findskipgrams(ngram, occurrencecount);
1435 for (
auto skipgram : skipgrams) {
1436 result.push_back(skipgram);
1473 std::vector<std::pair<IndexReference,PatternPointer>> result;
1474 for (
int i = 0; i < this->reverseindex->
sentencelength(sentence); i++) {
1477 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1479 result.push_back(std::pair<IndexReference,PatternPointer>(ref,pattern));
1491 std::vector<std::pair<IndexReference,PatternPointer>> result;
1495 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1497 result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
1509 std::vector<std::pair<IndexReference,PatternPointer>> result;
1510 for (
int i = 0; i < ref.
token; i++) {
1513 for (std::vector<PatternPointer>::iterator iter = tmpresult.begin(); iter != tmpresult.end(); iter++) {
1515 result.push_back(std::pair<IndexReference,PatternPointer>(ref2,pattern));
1527 cache_categories.clear();
1529 cache_grouptotal.clear();
1530 cache_grouptotalpatterns.clear();
1531 cache_categories.insert(0);
1534 while (iter != this->
end()) {
1536 const int c = pattern.category();
1537 cache_categories.insert(c);
1538 const int n = pattern.n();
1544 cache_grouptotal[c][n] += this->valuehandler.count(iter->second);
1545 cache_grouptotal[0][n] += this->valuehandler.count(iter->second);
1546 cache_grouptotalpatterns[c][n]++;
1547 cache_grouptotalpatterns[0][n]++;
1549 cache_grouptotal[c][0] += this->valuehandler.count(iter->second);
1550 cache_grouptotal[0][0] += this->valuehandler.count(iter->second);
1553 cache_grouptotalpatterns[c][0]++;
1554 cache_grouptotalpatterns[0][0]++;
1560 cache_grouptotalwordtypes.clear();
1561 cache_grouptotaltokens.clear();
1570 if ((cache_grouptotal.empty()) && (!this->data.empty())) this->
computestats();
1575 for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
1576 if ((category == 0) || (*iterc == category)) {
1577 for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
1578 if (((n == 0) || (*itern == n)) && (cache_grouptotalwordtypes[*iterc][*itern] == 0) ) {
1579 std::unordered_set<Pattern>
types;
1581 while (iter != this->
end()) {
1583 const int pn = (int) pattern.n();
1584 if ( (pn == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.category() == *iterc))) {
1585 types.insert(pattern);
1587 if (((*itern == 0) || (pn == *itern)) && ((*iterc == 0) || (pattern.category() == *iterc))) {
1588 std::vector<PatternType> unigrams;
1589 pattern.ngrams(unigrams, 1);
1590 for (
typename std::vector<PatternType>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
1596 cache_grouptotaltokens[*iterc][*itern] += this->valuehandler.count(iter->second);
1599 cache_grouptotalwordtypes[*iterc][*itern] += types.size();
1625 if ((cache_grouptotal.empty()) && (!this->data.empty())) this->
computestats();
1626 return cache_grouptotal[category][n];
1636 if ((cache_grouptotalpatterns.empty()) && (!this->data.empty())) this->
computestats();
1637 return cache_grouptotalpatterns[category][n];
1648 if ((cache_grouptotalwordtypes.empty()) && (!this->data.empty())) this->
computecoveragestats(category,n);
1649 return cache_grouptotalwordtypes[category][n];
1659 if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) this->
computecoveragestats(category,n);
1660 return cache_grouptotaltokens[category][n];
1692 ValueType * data =
getdata(pattern,
true);
1693 this->
add(pattern, data, ref );
1705 if (value == NULL) {
1706 std::cerr <<
"Add() value is NULL!" << std::endl;
1709 this->valuehandler.add(value, ref);
1712 if (value == NULL) {
1713 std::cerr <<
"Add() value is NULL!" << std::endl;
1716 this->valuehandler.add(value, ref);
1728 unsigned int prune(
int threshold,
int _n=0) {
1731 unsigned int pruned = 0;
1733 while (iter != this->
end()) {
1735 if (( (_n == 0) || (pattern.n() == (
unsigned int) _n) )&& ((threshold == -1) || (
occurrencecount(pattern) < (
unsigned int) threshold))) {
1739 iter = this->erase(iter);
1758 virtual unsigned int pruneskipgrams(
unsigned int threshold,
int minskiptypes=2,
int _n = 0) {
1760 unsigned int pruned = 0;
1761 if (minskiptypes <=1)
return pruned;
1764 while(iter != this->
end()) {
1766 if (( (_n == 0) || ((
int) pattern.n() == _n) ) && (pattern.category() ==
SKIPGRAM)) {
1768 iter = this->erase(iter);
1785 unsigned int pruned = 0;
1790 while (iter != this->
end()) {
1792 if ( (_n == 0) || (pattern.n() == (
unsigned int) _n) ) {
1793 if (s.find(pattern) == s.end()) {
1795 iter = this->erase(iter);
1810 template<
class ValueType2,
class ValueHandler2,
class MapType2>
1816 unsigned int pruned = 0;
1818 while(iter != this->
end()) {
1820 if (!secondmodel.
has(pattern)) {
1821 iter = this->erase(iter);
1836 std::vector<std::pair<Pattern, int> > v;
1837 std::vector<std::pair<Pattern, int> > ngrams;
1839 for (std::vector<std::pair<Pattern, int> >::iterator iter = ngrams.begin(); iter != ngrams.end(); iter++) {
1840 const Pattern p = iter->first;
1841 if (this->
has(p)) v.push_back(*iter);
1855 bool haveoutput =
false;
1858 *out <<
"PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY" << std::endl;
1862 this->
print(out, decoder, pattern,
true);
1865 std::cerr << std::endl <<
"Legend:" << std::endl;
1866 std::cerr <<
" - PATTERN : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;
1867 std::cerr <<
" - COUNT : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
1868 std::cerr <<
" - TOKENS : The maximum number of tokens in the corpus that this pattern covers. *THIS IS JUST A MAXIMUM PROJECTION* rather than an exact number because your model is not indexed" << std::endl;
1869 std::cerr <<
" - COVERAGE : The maximum number of tokens covered, as a fraction of the total in the corpus (projection)" << std::endl;
1870 std::cerr <<
" - CATEGORY : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
1871 std::cerr <<
" - SIZE : The size of the pattern (in tokens)" << std::endl;
1872 std::cerr <<
" - FREQUENCY : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
1873 std::cerr <<
" - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
1884 if (!this->reverseindex)
return;
1889 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
1891 *out <<
"\t" << p.
tostring(decoder);
1903 this->
print(out, decoder);
1912 const std::string pattern_s = pattern.tostring(decoder);
1916 const double freq = this->
frequency(pattern);
1917 const int cat = pattern.category();
1921 }
else if (cat == 2) {
1923 }
else if (cat == 3) {
1926 *out << pattern_s <<
"\t" << count <<
"\t" <<
"\t" << covcount <<
"\t" << coverage <<
"\t" << cat_s <<
"\t" << pattern.size() <<
"\t" << freq;
1927 if (endline) *out << std::endl;
1936 return this->
print(out,decoder,pattern,endline);
1948 void histogram(std::map<unsigned int,unsigned int> & hist,
unsigned int threshold = 0,
unsigned int cap = 0,
int category = 0,
unsigned int size = 0) {
1951 if (((category != 0) && (pattern.category() != category)) || ((
size != 0) && (
size != pattern.size())))
continue;
1953 if (c >= threshold) hist[c]++;
1956 unsigned int sum = 0;
1957 std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
1958 while ((sum < cap) && (iter != hist.rend())) {
1960 sum += iter->second;
1963 hist.erase(iter.base(), hist.end());
1969 std::map<unsigned int,unsigned int> hist;
1971 std::map<unsigned int,unsigned int>::reverse_iterator iter = hist.rbegin();
1972 if (iter != hist.rend()) {
1988 void histogram(std::ostream * OUT,
unsigned int threshold = 0,
unsigned int cap = 0 ,
int category = 0,
unsigned int size = 0) {
1989 std::map<unsigned int,unsigned int> hist;
1991 *OUT <<
"HISTOGRAM" << std::endl;
1992 *OUT <<
"------------------------------" << std::endl;
1993 *OUT <<
"OCCURRENCES\tPATTERNS" << std::endl;
1994 for (std::map<unsigned int,unsigned int>::iterator iter = hist.begin(); iter != hist.end(); iter++) {
1995 *OUT << iter->first <<
"\t" << iter->second << std::endl;
2004 *OUT <<
"Type: indexed" << std::endl;
2006 *OUT <<
"Type: unindexed" << std::endl;
2009 *OUT <<
"Type: unknown" << std::endl;
2011 *OUT <<
"Total tokens: " << this->totaltokens << std::endl;
2012 *OUT <<
"Total word types: " << this->totaltypes << std::endl;
2013 *OUT <<
"Types patterns loaded: " << this->
size() << std::endl;
2014 *OUT <<
"Min n: " << this->minn << std::endl;
2015 *OUT <<
"Max n: " << this->maxn << std::endl;
2016 if (this->reverseindex) {
2017 *OUT <<
"Reverse index: yes" << std::endl;
2018 *OUT <<
"References in reverse index: " << this->reverseindex->
size() << std::endl;
2020 *OUT <<
"Reverse index: no" << std::endl;
2022 *OUT <<
"Size of Pattern: " <<
sizeof(
Pattern) <<
" byte" << std::endl;
2023 *OUT <<
"Size of ValueType: " <<
sizeof(ValueType) <<
" byte" << std::endl;
2024 unsigned int totalkeybs = 0;
2025 unsigned int totalvaluebs = 0;
2028 totalkeybs +=
sizeof(
PatternType) + pattern.bytesize();
2029 totalvaluebs +=
sizeof(ValueType);
2031 *OUT <<
"Total key bytesize (patterns): " << totalkeybs <<
" bytes (" << (totalkeybs/1024/1024) <<
" MB)" << std::endl;
2032 *OUT <<
"Total value bytesize (counts/index): " << totalvaluebs <<
" bytes (" << (totalvaluebs/1024/1024) <<
" MB)" << std::endl;
2033 *OUT <<
"Mean key bytesize: " << (totalkeybs / (float) this->
size()) << std::endl;
2034 *OUT <<
"Mean value bytesize: " << (totalvaluebs / (float) this->
size()) << std::endl;
2036 unsigned int ri_totalkeybs = 0;
2037 unsigned int ri_totalvaluebs = 0;
2038 if (this->reverseindex) {
2040 ri_totalkeybs +=
sizeof(iter->first.sentence) +
sizeof(iter->first.token);
2043 *OUT <<
"Total key bytesize in reverse index (references): " << ri_totalkeybs <<
" bytes (" << (ri_totalkeybs/1024/1024) <<
" MB)" << std::endl;
2044 *OUT <<
"Total value bytesize in reverse index (patterns): " << ri_totalvaluebs <<
" bytes (" << (ri_totalvaluebs/1024/1024) <<
" MB)" << std::endl;
2048 const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
2049 *OUT <<
"Total bytesize (without overhead): " << t <<
" bytes (" << (t/1024/1024) <<
" MB)" << std::endl;
2057 if ((cache_grouptotaltokens.empty()) && (!this->data.empty())) {
2058 std::cerr <<
"Computing statistics..." << std::endl;
2061 *OUT << std::setiosflags(std::ios::fixed) << std::setprecision(4) << std::endl;
2062 *OUT <<
"REPORT" << std::endl;
2064 *OUT <<
" Warning: Model is unindexed, token coverage counts are mere maximal projections" << std::endl;
2065 *OUT <<
" assuming no overlap at all!!! Use an indexed model for accurate coverage counts" << std::endl;
2067 *OUT <<
"----------------------------------" << std::endl;
2068 *OUT <<
" " << std::setw(15) <<
"PATTERNS" << std::setw(15) <<
"TOKENS" << std::setw(15) <<
"COVERAGE" << std::setw(15) <<
"TYPES" << std::setw(15) << std::endl;
2069 *OUT <<
"Total: " << std::setw(15) <<
"-" << std::setw(15) << this->
tokens() << std::setw(15) <<
"-" << std::setw(15) << this->
types() << std::endl;
2074 if (coveredtokens > this->
tokens()) coveredtokens = this->
tokens();
2075 unsigned int uncoveredtokens = this->
tokens() - coveredtokens;
2076 if (uncoveredtokens < 0) uncoveredtokens = 0;
2077 *OUT <<
"Uncovered: " << std::setw(15) <<
"-" << std::setw(15) << uncoveredtokens << std::setw(15) << uncoveredtokens / (double) this->
tokens() << std::setw(15) << this->
types() - coveredtypes << std::endl;
2078 *OUT <<
"Covered: " << std::setw(15) << this->
size() << std::setw(15) << coveredtokens << std::setw(15) << coveredtokens / (double) this->
tokens() << std::setw(15) << coveredtypes << std::endl << std::endl;
2082 bool haveoutput =
false;
2083 for (std::set<int>::iterator iterc = cache_categories.begin(); iterc != cache_categories.end(); iterc++) {
2084 const int c = *iterc;
2085 if (cache_grouptotalpatterns.count(c))
2086 for (std::set<int>::iterator itern = cache_n.begin(); itern != cache_n.end(); itern++) {
2087 const int n = *itern;
2088 if (cache_grouptotalpatterns[c].count(n)) {
2091 *OUT << std::setw(15) <<
"CATEGORY" << std::setw(15) <<
"N (SIZE) "<< std::setw(15) <<
"PATTERNS";
2093 *OUT << std::setw(15) <<
"TYPES" << std::setw(15) <<
"OCCURRENCES" << std::endl;
2098 *OUT << std::setw(15) <<
"all";
2099 }
else if (c ==
NGRAM) {
2100 *OUT << std::setw(15) <<
"n-gram";
2102 *OUT << std::setw(15) <<
"skipgram";
2104 *OUT << std::setw(15) <<
"flexgram";
2108 *OUT << std::setw(15) <<
"all";
2110 *OUT << std::setw(15) << n;
2113 *OUT << std::setw(15) << cache_grouptotalpatterns[c][n];
2116 *OUT << std::setw(15) << cache_grouptotaltokens[c][n];
2118 *OUT << std::setw(15) << cache_grouptotaltokens[c][n] / (double) this->
tokens();
2121 *OUT << std::setw(15) << cache_grouptotalwordtypes[c][n];
2123 *OUT << std::setw(15) << cache_grouptotal[c][n] << std::endl;;
2129 std::cerr << std::endl <<
"Legend:" << std::endl;
2130 std::cerr <<
" - PATTERNS : The number of distinct patterns within the group" << std::endl;
2132 std::cerr <<
" - TOKENS : The number of tokens that is covered by the patterns in the group." << std::endl;
2133 std::cerr <<
" - COVERAGE : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
2135 std::cerr <<
" - TYPES : The number of unique *word/unigram* types in this group" << std::endl;
2136 std::cerr <<
" - OCCURRENCES : The total number of occurrences of the patterns in this group" << std::endl;
2152 const int patternlength = pattern.n();
2156 std::vector<Pattern> subngrams;
2158 for (std::vector<Pattern>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) {
2159 const Pattern pattern2 = *iter2;
2191 template<
class MapType = PatternMap<IndexedData,IndexedDataHandler>,
class PatternType = Pattern>
2196 const Pattern p = iter->first;
2197 const int n = p.
n();
2198 if (n > this->
maxn) this->
maxn = n;
2199 if (n < this->
minn) this->minn = n;
2204 if (!options.
QUIET) std::cerr <<
"Sorting all indices..." << std::endl;
2206 iter->second.sort();
2221 this->attachcorpus(*corpus);
2239 this->attachcorpus(*corpus);
2243 this->
load(f,options, constrainmodel);
2258 this->attachcorpus(*corpus);
2262 std::ifstream * in =
new std::ifstream(filename.c_str());
2263 this->
load( (std::istream *) in, options, constrainmodel);
2283 if (value == NULL) {
2284 value =
getdata(pattern,
true);
2286 this->valuehandler.add(value, ref);
2289 if (value == NULL) {
2290 value =
getdata(patternpointer,
true);
2292 this->valuehandler.add(value, ref);
2300 typename MapType::iterator iter = this->find(pattern);
2301 if (iter != this->
end()) {
2302 return &(iter->second);
2303 }
else if (makeifnew) {
2304 return &((*this)[pattern]);
2311 typename MapType::iterator iter = this->find(pattern);
2312 if (iter != this->
end()) {
2313 return &(iter->second);
2314 }
else if (makeifnew) {
2315 return &((*this)[pattern]);
2322 if ((options.
DOSKIPGRAMS) && (this->reverseindex == NULL)) {
2323 std::cerr <<
"ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
2330 if ((options.
DOSKIPGRAMS) && (this->reverseindex == NULL)) {
2331 std::cerr <<
"ERROR: You must specify a reverse index if you want to train skipgrams (or train skipgrams exhaustively)" << std::endl;
2342 *OUT <<
"Type: indexed" << std::endl;
2344 *OUT <<
"Type: unindexed" << std::endl;
2347 *OUT <<
"Type: unknown" << std::endl;
2349 *OUT <<
"Total tokens: " << this->
totaltokens << std::endl;
2350 *OUT <<
"Total word types: " << this->
totaltypes << std::endl;
2351 *OUT <<
"Types patterns loaded: " << this->
size() << std::endl;
2352 *OUT <<
"Min n: " << this->
minn << std::endl;
2353 *OUT <<
"Max n: " << this->
maxn << std::endl;
2355 *OUT <<
"Reverse index: yes" << std::endl;
2356 *OUT <<
"References in reverse index: " << this->
reverseindex->
size() << std::endl;
2358 *OUT <<
"Reverse index: no" << std::endl;
2360 *OUT <<
"Size of Pattern: " <<
sizeof(
Pattern) <<
" byte" << std::endl;
2361 unsigned int totalkeybs = 0;
2362 unsigned int totalvaluebs = 0;
2363 unsigned int indexlengthsum = 0;
2365 const Pattern pattern = iter->first;
2368 indexlengthsum += iter->second.size();
2370 *OUT <<
"Total key bytesize (patterns): " << totalkeybs <<
" bytes (" << (totalkeybs/1024/1024) <<
" MB)" << std::endl;
2371 *OUT <<
"Total value bytesize (counts/index): " << totalvaluebs <<
" bytes (" << (totalvaluebs/1024/1024) <<
" MB)" << std::endl;
2372 *OUT <<
"Mean key bytesize: " << (totalkeybs / (float) this->
size()) << std::endl;
2373 *OUT <<
"Mean value bytesize: " << (totalvaluebs / (float) this->
size()) << std::endl;
2374 *OUT <<
"Mean index length (ttr): " << (indexlengthsum / (float) this->
size()) << std::endl;
2376 unsigned int ri_totalkeybs = 0;
2377 unsigned int ri_totalvaluebs = 0;
2380 ri_totalkeybs +=
sizeof(iter->first.sentence) +
sizeof(iter->first.token);
2383 *OUT <<
"Total key bytesize in reverse index (references): " << ri_totalkeybs <<
" bytes (" << (ri_totalkeybs/1024/1024) <<
" MB)" << std::endl;
2384 *OUT <<
"Total value bytesize in reverse index (patterns): " << ri_totalvaluebs <<
" bytes (" << (ri_totalvaluebs/1024/1024) <<
" MB)" << std::endl;
2387 const unsigned int t = (totalkeybs + totalvaluebs + ri_totalkeybs + ri_totalvaluebs);
2388 *OUT <<
"Total bytesize (without overhead): " << t <<
" bytes (" << (t/1024/1024) <<
" MB)" << std::endl;
2399 bool haveoutput =
false;
2402 *out <<
"PATTERN\tCOUNT\tTOKENS\tCOVERAGE\tCATEGORY\tSIZE\tFREQUENCY\tREFERENCES" << std::endl;
2406 this->
print(out, decoder, pattern,
true);
2409 std::cerr << std::endl <<
"Legend:" << std::endl;
2410 std::cerr <<
" - PATTERN : The pattern, Gaps in skipgrams are represented as {*}. Variable-width gaps in flexgrams are shown using {**}." << std::endl;
2411 std::cerr <<
" - COUNT : The occurrence count - the amount of times the pattern occurs in the data" << std::endl;
2412 std::cerr <<
" - TOKENS : The number of tokens in the corpus that this pattern covers" << std::endl;
2413 std::cerr <<
" - COVERAGE : The number of tokens covered, as a fraction of the total in the corpus" << std::endl;
2414 std::cerr <<
" - CATEGORY : The pattern type category (ngram,skipgram,flexgram)" << std::endl;
2415 std::cerr <<
" - SIZE : The size of the pattern (in tokens)" << std::endl;
2416 std::cerr <<
" - FREQUENCY : The frequency of the pattern *within it's pattern type category and size-class*." << std::endl;
2417 std::cerr <<
" - REFERENCES : A space-delimited list of sentence:token position where the pattern occurs in the data. Sentences start at 1, tokens at 0" << std::endl;
2422 const std::string pattern_s = pattern.
tostring(decoder);
2426 const double freq = this->
frequency(pattern);
2427 const int cat = pattern.
category();
2431 }
else if (cat == 2) {
2433 }
else if (cat == 3) {
2436 *out << pattern_s <<
"\t" << count <<
"\t" <<
"\t" << covcount <<
"\t" << coverage <<
"\t" << cat_s <<
"\t" << pattern.
size() <<
"\t" << freq <<
"\t";
2441 *out << iter2->tostring();
2442 if (i < count) *out <<
" ";
2444 if (endline) *out << std::endl;
2456 for (
int n = 3; n <= options.
MAXLENGTH; n++) {
2458 if (!options.
QUIET) std::cerr <<
"Counting " << n <<
"-skipgrams" << std::endl;
2459 int foundskipgrams = 0;
2460 for (
typename MapType::iterator iter = this->
begin(); iter != this->
end(); iter++) {
2463 if (((
int) pattern.
n() == n) && (pattern.
category() ==
NGRAM) ) foundskipgrams += this->
computeskipgrams(pattern,options, NULL, &multirefs, constrainbymodel,
false);
2465 if (!foundskipgrams) {
2466 std::cerr <<
" None found" << std::endl;
2471 if (!options.
QUIET) std::cerr <<
" Found " << foundskipgrams <<
" skipgrams...";
2473 if (!options.
QUIET) std::cerr <<
"pruned " << pruned;
2475 if (prunedextra && !options.
QUIET) std::cerr <<
" plus " << prunedextra <<
" extra skipgrams..";
2476 if (!options.
QUIET) std::cerr <<
"...total kept: " << foundskipgrams - pruned - prunedextra << std::endl;
2486 std::cerr <<
"ERROR: getpatternfromtoken() No reverse index loaded" << std::endl;
2502 std::cerr <<
"ERROR: No corpus data loaded! (in PatternModel::getskipcontent)" << std::endl;
2507 const unsigned int n = pattern.
n();
2520 skipcontent_atref_raw.
mask = skipcontent_mask;
2524 skipcontent[skipcontent_atref] += 1;
2542 while (iter != relations.
end()) {
2543 if (iter->second < occurrencethreshold) {
2546 relations.
erase(eraseiter);
2563 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2575 const int _n = pattern.
n();
2582 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2585 if (((
int) candidate.
n() == _n) && (candidate != pattern) && (candidate.
category() ==
SKIPGRAM) && ((occurrencethreshold == 0) || (this->
occurrencecount(pattern) >= occurrencethreshold)) ) {
2586 templates[candidate] += 1;
2590 if (occurrencethreshold > 0) this->
prunerelations(templates, occurrencethreshold);
2604 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2616 const int _n = pattern.
n();
2623 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2626 if (((
int) candidate.
n() == _n) && (candidate != pattern) && (candidate.
category() ==
NGRAM) && ((occurrencethreshold == 0) || (this->
occurrencecount(pattern) >= occurrencethreshold)) ) {
2627 instances[candidate] += 1;
2631 if (occurrencethreshold > 0) this->
prunerelations(instances, occurrencethreshold);
2644 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2657 const int _n = pattern.
n();
2662 for (
int i = ref.
token; i < ref.
token + _n; i++) {
2664 int maxsubn = _n - (i - ref.
token);
2670 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2674 if (((
int) candidate.
n() <= maxsubn) && (candidate != pattern)
2675 && ((occurrencethreshold == 0) || (this->
occurrencecount(candidate) >= occurrencethreshold))
2676 && ((category == 0) || (candidate.
category() >= category))
2677 && ((
size == 0) || (candidate.
n() >=
size))
2683 subchildren[candidate] = subchildren[candidate] + 1;
2688 subchildren[candidate]++;
2694 if (occurrencethreshold > 0) this->
prunerelations(subchildren, occurrencethreshold);
2709 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2719 const int _n = pattern.
n();
2727 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2728 if ((iter2->first.sentence != ref.
sentence) || (iter2->first.token > ref.
token))
break;
2731 int minsubsize = _n + (ref.
token - iter2->first.token);
2733 if (((
int) candidate.
n() >= minsubsize) && (candidate != pattern)
2734 && ((occurrencethreshold == 0) || (this->
occurrencecount(candidate) >= occurrencethreshold))
2735 && ((category == 0) || (candidate.
category() >= category))
2736 && ((
size == 0) || (candidate.
n() >=
size))
2742 subsumes[candidate] += 1;
2747 subsumes[candidate] += 1;
2752 if (occurrencethreshold > 0) this->
prunerelations(subsumes, occurrencethreshold);
2766 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2781 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2785 && ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
2786 && ((category == 0) || (neighbour.
category() >= category))
2787 && ((
size == 0) || (neighbour.
n() >=
size))
2789 neighbours[neighbour]++;
2790 if ((cutoff > 0) && (neighbours.
size() >= cutoff))
break;
2793 if ((cutoff > 0) && (neighbours.
size() >= cutoff))
break;
2795 if (occurrencethreshold > 0) this->
prunerelations(neighbours, occurrencethreshold);
2808 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2824 for (std::vector<PatternPointer>::iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2826 if ( ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
2827 && ((category == 0) || (neighbour.
category() >= category))
2828 && ((
size == 0) || (neighbour.
n() >=
size)) ) {
2829 neighbours[neighbour]++;
2830 if ((cutoff > 0) && (neighbours.
size() >= cutoff))
break;
2833 if ((cutoff > 0) && (neighbours.
size() >= cutoff))
break;
2835 if (occurrencethreshold > 0) this->
prunerelations(neighbours, occurrencethreshold);
2847 if (minskiptypes <=1)
return pruned;
2850 while(iter != this->
end()) {
2852 if (( (_n == 0) || ((
int) pattern.n() == _n) ) && (pattern.category() ==
SKIPGRAM)) {
2855 if (skipcontent2.
size() != skipcontent.
size()) {
2856 std::cerr <<
" Pattern " << pattern.hash() <<
" discrepancy!!! " << skipcontent.
size() <<
" vs " << skipcontent2.
size() << std::endl;
2860 if ((
int) skipcontent.
size() < minskiptypes) {
2862 iter = this->erase(iter);
2882 if ((this->
cache_n.size() == 1) && (*this->
cache_n.begin() == 1) && (n <= 1)) {
2886 while (iter != this->
end()) {
2894 if ((category == 0) || (*iterc == category)) {
2895 for (std::set<int>::iterator itern = this->
cache_n.begin(); itern != this->
cache_n.end(); itern++) {
2897 std::unordered_set<Pattern>
types;
2898 std::set<IndexReference>
tokens;
2900 while (iter != this->
end()) {
2901 const Pattern pattern = iter->first;
2902 const int n = pattern.
n();
2903 if ( (n == 1) && (*itern <= 1) && ((*iterc == 0) || (pattern.
category() == *iterc))) {
2904 types.insert(pattern);
2906 if (((*itern == 0) || (n == *itern)) && ((*iterc == 0) || (pattern.
category() == *iterc))) {
2907 std::vector<Pattern> unigrams;
2908 pattern.
ngrams(unigrams, 1);
2909 for (std::vector<Pattern>::iterator iter2 = unigrams.begin(); iter2 != unigrams.end(); iter2++) {
2918 for (
unsigned int i = 0; i < pattern.
n(); i++) {
2919 tokens.insert(*dataiter + i);
2941 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2950 const int _n = pattern.
n();
2958 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
2962 && ((occurrencethreshold == 0) || (this->
occurrencecount(neighbour) >= occurrencethreshold))
2963 && ((category == 0) || (neighbour.
category() >= category))
2964 && ((
size == 0) || (neighbour.
n() >=
size))
2967 if (matches != NULL) matches->
insert(ref2);
2971 if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
2985 std::cerr <<
"ERROR: No reverse index present" << std::endl;
2999 std::vector<std::pair<IndexReference,PatternPointer>> rindex = this->
getreverseindex_left(ref);
3000 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
3003 const int _n = neighbour.
n();
3005 && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
3006 && ((category == 0) || (neighbour.
category() >= category))
3007 && ((
size == 0) || (neighbour.
n() >=
size))
3013 if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
3028 std::cerr <<
"ERROR: No reverse index present" << std::endl;
3037 const int _n = pattern.
n();
3045 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter2 = rindex.begin(); iter2 != rindex.end(); iter2++) {
3048 if ((ordersignificant) && (neighbour.
pattern() < pattern))
continue;
3049 const int _n2 = neighbour.
n();
3051 && ((occurrencethreshold == 0) || (this->occurrencecount(neighbour) >= occurrencethreshold))
3052 && ((category == 0) || (neighbour.
category() >= category))
3053 && ((
size == 0) || (neighbour.
n() >=
size))
3059 if (occurrencethreshold > 0) this->
prunerelations(cooc, occurrencethreshold);
3083 total += iter->second;
3085 if (total == 0)
return;
3086 double total_f = total;
3087 const std::string pattern_s = pattern.
tostring(classdecoder);
3090 *OUT <<
"\t" << pattern_s <<
"\t" << label <<
"\t" << pattern2.
tostring(classdecoder) <<
"\t" << iter->second <<
"\t" << iter->second / total_f <<
"\t" << this->
occurrencecount(pattern2) << std::endl;
3102 if (outputheader) *OUT <<
"#\tPATTERN1\tRELATION\tPATTERN2\tREL.COUNT\tREL.FREQUENCY\tCOUNT2" << std::endl;
3105 this->
outputrelations(pattern, relations, classdecoder, OUT,
"SUBSUMED-BY");
3109 this->
outputrelations(pattern, relations, classdecoder, OUT,
"SUBSUMES");
3113 this->
outputrelations(pattern, relations, classdecoder, OUT,
"RIGHT-NEIGHBOUR-OF");
3117 this->
outputrelations(pattern, relations, classdecoder, OUT,
"LEFT-NEIGHBOUR-OF");
3121 this->
outputrelations(pattern, relations, classdecoder, OUT,
"LEFT-COOC-OF");
3125 this->
outputrelations(pattern, relations, classdecoder, OUT,
"RIGHT-COOC-OF");
3129 this->
outputrelations(pattern, relations, classdecoder, OUT,
"INSTANTIATED-BY");
3141 void computenpmi( std::map<PatternPointer,t_relationmap_double> & coocmap ,
double threshold,
bool right=
true,
bool left=
true) {
3147 if ((right)&&(!left)) {
3149 }
else if ((left)&&(!right)) {
3151 }
else if (left && right) {
3156 const double value =
npmi(pattern,pattern2,iter2->second);
3157 if (value >= threshold) coocmap[pattern][pattern2] = value;
3169 void computecooc( std::map<PatternPointer,t_relationmap> & coocmap ,
int threshold,
bool right=
true,
bool left=
true) {
3173 if ((right)&&(!left)) {
3175 }
else if ((left)&&(!right)) {
3177 }
else if (left && right) {
3178 tmp = this->
getcooc(pattern, threshold);
3182 const double value = iter2->second;
3183 if (value >= threshold) coocmap[pattern][pattern2] = value;
3197 if (pattern.category() ==
SKIPGRAM) {
3198 const PatternType flexgram = pattern.toflexgram();
3199 if (!this->
has(flexgram)) count++;
3204 this->data[flexgram].
insert(ref);
3220 const unsigned char dynamicgap = 129;
3228 const double value =
npmi(pattern,pattern2,iter2->second);
3229 if (value >= threshold) {
3230 const Pattern flexgram = pattern + dynamicpattern + pattern2;
3231 if (!this->
has(flexgram)) found++;
3232 this->data[flexgram] = value;
3244 std::map<PatternPointer,t_relationmap_double> npmimap;
3245 std::cerr <<
"Collecting patterns and computing NPMI..." << std::endl;
3248 std::cerr <<
"Building inverse map..." << std::endl;
3250 std::multimap<double,std::pair<PatternPointer,PatternPointer>> inversemap;
3251 std::map<PatternPointer,t_relationmap_double>::iterator iter = npmimap.begin();
3252 while (iter != npmimap.end()) {
3254 inversemap.insert(std::pair<
double,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<Pattern,Pattern>(iter->first, iter2->first)));
3256 iter = npmimap.erase(iter);
3259 *OUT <<
"Pattern1\tPattern2\tNPMI" << std::endl;
3260 for (std::multimap<
double,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
3263 *OUT << pattern1.
tostring(classdecoder) <<
"\t" << pattern2.
tostring(classdecoder) <<
"\t" << iter2->first << std::endl;
3272 std::map<PatternPointer,t_relationmap> coocmap;
3273 std::cerr <<
"Collecting patterns and computing co-occurrence..." << std::endl;
3276 std::cerr <<
"Building inverse map..." << std::endl;
3278 std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>> inversemap;
3279 std::map<PatternPointer,t_relationmap>::iterator iter = coocmap.begin();
3280 while (iter != coocmap.end()) {
3282 inversemap.insert(std::pair<uint32_t,std::pair<PatternPointer,PatternPointer>>(iter2->second, std::pair<PatternPointer,PatternPointer>(iter->first, iter2->first)));
3284 iter = coocmap.erase(iter);
3287 *OUT <<
"Pattern1\tPattern2\tCooc" << std::endl;
3288 for (std::multimap<uint32_t,std::pair<PatternPointer,PatternPointer>>::reverse_iterator iter2 = inversemap.rbegin(); iter2 != inversemap.rend(); iter2++) {
3289 const Pattern pattern1 = iter2->second.first;
3290 const Pattern pattern2 = iter2->second.second;
3291 *OUT << pattern1.
tostring(classdecoder) <<
"\t" << pattern2.
tostring(classdecoder) <<
"\t" << iter2->first << std::endl;
3304 std::vector<Pattern> parts;
3305 int numberofparts = pattern.
parts(parts);
3306 bool strictbegin =
true;
3307 std::multimap<int, IndexReference> partmatches;
3310 for (std::vector<std::pair<IndexReference,PatternPointer>>::
iterator iter = rindex.begin(); iter != rindex.end(); iter++) {
3313 partmatches.insert(std::pair<int,IndexReference>(i, ref));
3317 int firsttoken = begin.
token;
3319 for (
int j = 0; j < numberofparts; j++) {
3323 for (std::multimap<int, IndexReference>::iterator iter = partmatches.lower_bound(j); iter != partmatches.upper_bound(j); iter++) {
3325 if (iter->first != prevlevel) {
3329 if (((iter->second == begin) || (begin < iter->second)) && (iter->second + parts[j].n() + 1 < nextbegin)) {
3330 nextbegin = iter->second + parts[j].n() + 1;
3332 prevlevel = iter->first;
3334 if (!found)
return 0;
3336 return (nextbegin.
token - firsttoken);
3341 template<
class ValueType,
class ValueHandler = BaseValueHandler<ValueType>,
class MapType = PatternPo
interMap<ValueType, BaseValueHandler<ValueType>>>
3350 this->attachcorpus(*corpus);
3369 this->attachcorpus(*corpus);
3373 this->
load(f,options, constrainmodel);
3388 this->attachcorpus(*corpus);
3392 std::ifstream * in =
new std::ifstream(filename.c_str());
3393 this->
load( (std::istream *) in, options, constrainmodel);
3409 if ((patternpointer.
data < this->reverseindex->beginpointer()) || (patternpointer.
data > this->reverseindex->beginpointer() + this->
reverseindex->
bytesize())) {
3410 std::cerr <<
"Pattern Pointer points outside contained corpus data..." << std::endl;
3413 ValueType * data = this->
getdata(patternpointer,
true);
3417 this->
add(patternpointer, data, ref );
3429 if (value == NULL) {
3430 std::cerr <<
"Add() value is NULL!" << std::endl;
3433 this->valuehandler.add(value, ref);
3440 template<
class MapType=PatternPo
interMap<IndexedData, IndexedDataHandler>>
3449 this->attachcorpus(*corpus);
3468 this->attachcorpus(*corpus);
3472 this->
load(f,options, constrainmodel);
3487 this->attachcorpus(*corpus);
3491 std::ifstream * in =
new std::ifstream(filename.c_str());
3492 this->
load( (std::istream *) in, options, constrainmodel);
3508 if ((patternpointer.
data < this->reverseindex->beginpointer()) || (patternpointer.
data > this->reverseindex->beginpointer() + this->
reverseindex->
bytesize())) {
3509 std::cerr <<
"Pattern Pointer points outside contained corpus data..." << std::endl;
3513 this->
add(patternpointer, data, ref );
3517 if (value == NULL) {
3518 value = this->
getdata(patternpointer,
true);
3520 this->valuehandler.add(value, ref);
void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3271
bool out() const
Definition: pattern.cpp:345
virtual int minlength() const =0
void write(std::ostream *out)
Definition: patternmodel.h:438
void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:2398
unsigned char version() const
Definition: patternmodel.h:1379
int minn
Definition: patternmodel.h:534
void report(std::ostream *OUT)
Definition: patternmodel.h:2056
virtual t_relationmap gettemplates(const Pattern &pattern, int=0)
Definition: patternmodel.h:2171
virtual t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2173
int maskheadskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:68
virtual t_relationmap_double getnpmi(const Pattern &pattern, double threshold)
Definition: patternmodel.h:2176
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:2203
virtual void printreverseindex(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1883
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:377
virtual int minlength() const
Definition: patternmodel.h:492
unsigned char type() const
Definition: patternmodel.h:1378
Definition: patternmodel.h:86
virtual t_relationmap getinstances(const Pattern &pattern, int=0)
Definition: patternmodel.h:2172
int MINSKIPTYPES
Minimum required amount of distinct patterns that can fit in a gap of a skipgram for the skipgram to ...
Definition: patternmodel.h:136
size_t size() const
Definition: patternstore.h:597
const size_t size() const
Definition: pattern.h:436
int ngrams(std::vector< PatternPointer > &container, const int n) const
Definition: pattern.cpp:1072
int MAXLENGTH
The maximum length of patterns to be loaded/extracted, inclusive (in words/tokens) (default: 100) ...
Definition: patternmodel.h:126
IndexedData * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:2299
Pattern getpatternfromtoken(IndexReference ref)
Definition: patternmodel.h:2484
unsigned int totaltokensingroup(int category, int n)
Definition: patternmodel.h:1656
void printpattern(std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true)
Definition: patternmodel.h:1935
bool erase(const Pattern &pattern)
Definition: patternstore.h:821
IndexedCorpus * reverseindex
Pointer to the reverse index and corpus data for this model (or NULL)
Definition: patternmodel.h:563
std::vector< PatternPointer > getreverseindex(const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1408
Definition: datatypes.h:477
IndexedData * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:2310
unsigned int totalpatternsingroup(int category, int n)
Definition: patternmodel.h:1634
virtual int maxlength() const
Definition: patternmodel.h:1312
bool DOREVERSEINDEX
Obsolete now, only here for backward-compatibility with v1.
Definition: patternmodel.h:139
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:1680
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:364
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t >::iterator t_relationmap_iterator
Definition: patternmodel.h:232
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:394
virtual double frequency(const Pattern &)=0
bool empty() const
Definition: patternstore.h:270
unsigned char type() const
Definition: patternmodel.h:511
Definition: patternmodel.h:73
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278
virtual void load(std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:682
t_relationmap getsubchildren(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2642
virtual t_relationmap getsubchildren(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2169
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1711
const bool isskipgram() const
Definition: pattern.h:170
void test(MapType &target, std::istream *in)
void printmodel(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1902
void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:3243
void computecooc(std::map< PatternPointer, t_relationmap > &coocmap, int threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3169
int MINTOKENS
Definition: patternmodel.h:113
int getmodelversion() const
Definition: patternmodel.h:2272
Definition: pattern.h:357
bool instanceof(const Pattern &skipgram) const
Definition: pattern.cpp:1533
t_relationmap getleftcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2983
t_relationmap getskipcontent(const PatternPointer &pattern)
Definition: patternmodel.h:2499
std::vector< IndexReference >::iterator iterator
Definition: datatypes.h:109
bool DOPATTERNPERLINE
Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a ...
Definition: patternmodel.h:140
Contains lower-level containers for patterns.
double comparemodels_loglikelihood(const Pattern pattern, std::vector< PatternModel< uint32_t > * > &models)
Definition: patternmodel.cpp:23
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:1569
virtual ValueType * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: patternmodel.h:1343
bool DOREMOVESKIPGRAMS
Remove skip-grams from the model upon loading it.
Definition: patternmodel.h:146
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:2194
unsigned int sentences() const
Definition: patternstore.h:150
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:1081
Definition: patternstore.h:156
A pattern model based on an unordered set, does not hold data, only patterns. Very suitable for loadi...
Definition: patternmodel.h:299
PatternPointer getpattern(const IndexReference &begin, int length=1) const
Definition: pattern.cpp:1764
int getmodeltype() const
Definition: patternmodel.h:3497
iterator end()
Definition: patternstore.h:813
virtual int getmodelversion() const
Definition: patternmodel.h:359
ModelType
Definition: patternmodel.h:72
const size_t bytesize() const
Definition: pattern.cpp:57
t_relationmap getcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, bool ordersignificant=false)
Definition: patternmodel.h:3026
t_relationmap getsubparents(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:2705
virtual void trainskipgrams(const PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:1266
double npmi(const PatternPointer &key1, const PatternPointer &key2, int jointcount)
Definition: patternmodel.h:3067
virtual int getmodeltype() const
Definition: patternmodel.h:653
t_relationmap getinstances(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2601
A model mapping patterns to values, high-level interface.
Definition: patternmodel.h:526
const size_t n() const
Definition: pattern.cpp:93
int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:3192
int getmodeltype() const
Definition: patternmodel.h:3398
std::unordered_map< Pattern, ValueType >::iterator iterator
Definition: patternstore.h:807
bool DORESET
Sets all counts to zero upon loading, clears indices.
Definition: patternmodel.h:148
uint64_t totaltokens
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:530
virtual int maxlength() const =0
bool DOSKIPGRAMS_EXHAUSTIVE
Load/extract skipgrams in an exhaustive fashion? More memory intensive, but the only options for unin...
Definition: patternmodel.h:135
void output(std::ostream *)
vector< pair< int, int > > mask2vector(const uint32_t mask, const int n)
Definition: algorithms.cpp:35
Basic read-only interface for pattern models, abstract base class.
Definition: interface.h:39
virtual t_relationmap getleftneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2174
t_relationmap getrightneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2806
void computestats()
Definition: patternmodel.h:1526
t_relationmap getleftneighbours(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0)
Definition: patternmodel.h:2764
Limited virtual interface to pattern stores.
Definition: interface.h:20
void info(std::ostream *OUT)
Definition: patternmodel.h:2002
int getmodelversion() const
Definition: patternmodel.h:3399
virtual int getmodelversion() const =0
PatternSetModel(const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:341
unsigned char model_type
Definition: patternmodel.h:528
virtual int minlength() const
Definition: patternmodel.h:1316
virtual int getmodelversion() const
Definition: patternmodel.h:657
void prunerelations(t_relationmap &relations, unsigned int occurrencethreshold)
Definition: patternmodel.h:2539
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:1321
void histogram(std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1948
uint64_t totaltokens
Definition: patternmodel.h:303
virtual int getmodeltype() const
Definition: patternmodel.h:358
virtual void posttrain(const PatternModelOptions options)
Definition: patternmodel.h:558
virtual int computeskipgrams(const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false)
Definition: patternmodel.h:1245
void end(Measurement &m)
Definition: benchmarks.cpp:156
int MINTOKENS_UNIGRAMS
Definition: patternmodel.h:121
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexR...
Definition: patternstore.h:44
PatternMap< uint32_t, BaseValueHandler< uint32_t >, uint64_t > t_relationmap
Definition: patternmodel.h:224
int MINTOKENS_SKIPGRAMS
Definition: patternmodel.h:116
uint64_t totaltypes
Definition: patternmodel.h:304
virtual void computecoveragestats(int category=0, int n=0)
Definition: patternmodel.h:2877
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_right(const IndexReference ref)
Definition: patternmodel.h:1489
void outputrelations(const PatternPointer &pattern, t_relationmap &relations, ClassDecoder &classdecoder, std::ostream *OUT, const std::string label="RELATED-TO")
Definition: patternmodel.h:3080
virtual int maxlength() const
Definition: patternmodel.h:487
Definition: patternmodel.h:88
bool DEBUG
Output extra debug information.
Definition: patternmodel.h:151
PatternSet< uint64_t >::const_iterator const_iterator
Definition: patternmodel.h:482
const PatternCategory category() const
Definition: pattern.cpp:42
ReverseIndexType
Definition: patternmodel.h:85
unsigned char version() const
Definition: patternmodel.h:515
virtual void resetstats()
Definition: patternmodel.h:1559
virtual unsigned int tokens() const =0
bool DOREMOVEFLEXGRAMS
Remove flexgrams from the model upon loading it.
Definition: patternmodel.h:147
bool DOSKIPGRAMS
Load/extract skipgrams? (default: false)
Definition: patternmodel.h:134
Definition: patternmodel.h:3441
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
MapType::const_iterator const_iterator
Definition: patternmodel.h:1307
Reference to a position in the corpus.
Definition: datatypes.h:33
Definition: patternmodel.h:75
Definition: patternmodel.h:77
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782
virtual ValueType * getdata(const PatternPointer &pattern, bool makeifnew=false)
Definition: patternmodel.h:1354
void insert(const Pattern &pattern, ValueType &value)
Definition: patternstore.h:789
void read(std::istream *in, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true)
Definition: patternstore.h:644
unsigned char classencodingversion
Definition: patternstore.h:328
PatternSet< uint64_t > extractset(int minlength=1, int maxlength=1)
Definition: patternmodel.h:2147
unsigned char model_version
Definition: patternmodel.h:529
std::map< int, std::map< int, unsigned int > > cache_grouptotal
total occurrences (used for frequency computation, within a group)
Definition: patternmodel.h:539
int getmodelversion() const
Definition: patternmodel.h:3498
int subngrams(std::vector< PatternPointer > &container, int minn=1, int maxn=9) const
Definition: pattern.cpp:1142
const size_t bytesize() const
Definition: pattern.h:435
t_relationmap gettemplates(const Pattern &pattern, unsigned int occurrencethreshold=0)
Definition: patternmodel.h:2559
PatternSet< uint64_t >::iterator iterator
Definition: patternmodel.h:481
iterator end()
Definition: datatypes.h:115
bool isgap(int i) const
Definition: pattern.cpp:126
virtual bool has(const Pattern &pattern) const
Definition: patternmodel.h:669
void write(std::ostream *out)
Definition: patternstore.h:632
virtual int getmodeltype() const =0
double frequency(const Pattern &pattern)
Definition: patternmodel.h:1666
Definition: patternmodel.h:3342
Definition: patternmodel.h:78
unsigned int totalwordtypesingroup(int category, int n)
Definition: patternmodel.h:1645
virtual int computeflexgrams_fromcooc()
Definition: patternmodel.h:2178
const size_t size() const
Definition: pattern.h:156
MapType::iterator iterator
Definition: patternmodel.h:1306
bool DOREMOVEINDEX
Do not load index information (for indexed models), loads just the patterns without any counts...
Definition: patternmodel.h:144
int maxn
Definition: patternmodel.h:533
PatternModelInterface * getinterface()
Definition: patternmodel.h:465
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:672
virtual unsigned int types()=0
virtual void add(const Pattern &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:1704
bool QUIET
Don't output to stderr.
Definition: patternmodel.h:150
iterator end()
Definition: patternstore.h:224
unsigned int prune(int threshold, int _n=0)
Definition: patternmodel.h:1728
int MAXBACKOFFLENGTH
Definition: patternmodel.h:127
virtual std::vector< PatternPointer > findskipgrams(const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3)
Definition: patternmodel.h:1254
virtual unsigned int occurrencecount(const Pattern &pattern)=0
int MAXSKIPS
Maximum skips per skipgram.
Definition: patternmodel.h:137
virtual bool has(const PatternPointer &pattern) const
Definition: patternmodel.h:367
unsigned char model_type
Definition: patternmodel.h:301
void write(const std::string filename)
Definition: patternmodel.h:1299
Options for Pattern Model loading and training.
Definition: patternmodel.h:111
std::pair< IndexReference, PatternPointer > IndexPattern
Definition: patternstore.h:39
int sentencelength(int sentence) const
Definition: pattern.cpp:1806
int PRUNENONSUBSUMED
Definition: patternmodel.h:142
uint16_t token
Definition: datatypes.h:36
int ngrams(std::vector< Pattern > &container, const int n) const
Definition: pattern.cpp:1050
std::set< int > cache_n
Definition: patternmodel.h:538
size_t size() const
Definition: patternstore.h:800
iterator begin()
Definition: patternstore.h:810
int getmodeltype(const std::string &filename)
Definition: patternmodel.cpp:4
int getmodeltype() const
Definition: patternmodel.h:2271
virtual void outputcooc(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2180
int subngrams(std::vector< Pattern > &container, int minn=1, int maxn=99) const
Definition: pattern.cpp:1120
Definition: patternmodel.h:76
void insert(IndexReference ref)
Definition: datatypes.h:106
Collection of references to position in the corpus (IndexReference). Used by Indexed Pattern models...
Definition: datatypes.h:86
virtual void print(std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true)
Definition: patternmodel.h:1911
std::map< int, std::vector< uint32_t > > gapmasks
pre-computed masks representing possible gap configurations for various pattern lengths ...
Definition: patternmodel.h:545
bool reverseindex_internal
Definition: patternmodel.h:564
virtual t_relationmap getsubparents(const Pattern &pattern, int=0, int=0, int=0)
Definition: patternmodel.h:2170
virtual double frequency(const Pattern &)
Definition: patternmodel.h:479
PatternSetModel()
Definition: patternmodel.h:311
double coverage(const Pattern &key)
Definition: patternmodel.h:1397
PatternMap< double, BaseValueHandler< double >, uint64_t > t_relationmap_double
Definition: patternmodel.h:230
unsigned int bytesize() const
Definition: patternstore.h:118
uint32_t mask
Definition: pattern.h:362
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_left(const IndexReference ref)
Definition: patternmodel.h:1507
virtual int computeskipgrams(const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false)
Definition: patternmodel.h:1101
const PatternCategory category() const
Definition: pattern.cpp:46
virtual void outputcooc_npmi(std::ostream *OUT, ClassDecoder &classdecoder, double threshold)
Definition: patternmodel.h:2179
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:778
void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:3516
PatternPointer getsentence(int sentence) const
Definition: pattern.cpp:1826
bool DOREMOVENGRAMS
Remove n-grams from the model upon loading it.
Definition: patternmodel.h:145
std::set< int > cache_categories
Definition: patternmodel.h:537
std::vector< std::pair< IndexReference, PatternPointer > > getreverseindex_bysentence(int sentence)
Definition: patternmodel.h:1471
void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3507
void outputrelations(const PatternPointer &pattern, ClassDecoder &classdecoder, std::ostream *OUT, bool outputheader=true)
Definition: patternmodel.h:3101
void write(const std::string &filename)
Definition: patternmodel.h:455
virtual void train(const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2329
PatternModelInterface * getinterface()
Definition: patternmodel.h:765
Definition: patternmodel.h:74
iterator begin()
Definition: patternstore.h:214
virtual unsigned int tokens() const
Definition: patternmodel.h:505
unsigned char * data
Definition: pattern.h:360
int parts(std::vector< PatternPointer > &container) const
Definition: pattern.cpp:1337
size_t size()
Definition: patternstore.h:261
Class for encoding plain-text to binary class-encoded data.
void computenpmi(std::map< PatternPointer, t_relationmap_double > &coocmap, double threshold, bool right=true, bool left=true)
Definition: patternmodel.h:3141
const size_t n() const
Definition: pattern.cpp:89
void info(std::ostream *OUT)
Definition: patternmodel.h:2340
int pruneskipgrams(int threshold, int minskiptypes, int _n=0)
Definition: patternmodel.h:2845
unsigned int coveragecount(const Pattern &key)
Definition: patternmodel.h:1389
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of ...
Definition: patternmodel.h:2192
unsigned int topthreshold(int amount, int category=0, int size=0)
Definition: patternmodel.h:1967
uint32_t reversemask(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:58
void histogram(std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0)
Definition: patternmodel.h:1988
virtual unsigned int occurrencecount(const PatternPointer &pattern)
Definition: patternmodel.h:1330
std::vector< IndexReference >::const_iterator const_iterator
Definition: datatypes.h:110
A pattern store in the form of an unordered set (i.e, no duplicates). Stores only patterns...
Definition: patternstore.h:538
void print(std::ostream *out, ClassDecoder &decoder, const PatternPointer &pattern, bool endline=true)
Definition: patternmodel.h:2421
uint64_t totaltypes
Total number of unigram/word types in the original corpus, SO INCLUDING NOT COVERED BY THE MODEL! ...
Definition: patternmodel.h:531
unsigned int prunenotinset(const std::unordered_set< Pattern > &s, int _n)
Definition: patternmodel.h:1784
virtual unsigned int types()
Definition: patternmodel.h:498
std::map< int, std::map< int, unsigned int > > cache_grouptotalwordtypes
total covered word types per group
Definition: patternmodel.h:541
virtual void train(std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false)
Definition: patternmodel.h:2321
int masktailskip(uint32_t mask, const unsigned int n)
Definition: algorithms.cpp:77
PatternType
Definition: pattern.h:59
virtual int computeflexgrams_fromskipgrams()
Definition: patternmodel.h:2177
PatternModelOptions(const PatternModelOptions &ref)
Definition: patternmodel.h:188
Definition: patternmodel.h:97
unsigned int prunebymodel(PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel)
Definition: patternmodel.h:1811
int MINLENGTH
The minimum length of patterns to be loaded/extracted (in words/tokens) (default: 1) ...
Definition: patternmodel.h:125
void write(std::ostream *out)
Definition: patternmodel.h:1279
virtual unsigned int types()
Definition: patternmodel.h:1368
virtual void add(const PatternPointer &patternpointer, const IndexReference &ref)
Definition: patternmodel.h:3408
int maxn
Definition: patternmodel.h:305
Pattern pattern() const
Definition: pattern.h:527
virtual size_t size() const
Definition: patternmodel.h:662
int minn
Definition: patternmodel.h:306
virtual unsigned int occurrencecount(const Pattern &pattern)
Definition: patternmodel.h:473
virtual void load(std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:700
unsigned char getdataversion(std::istream *in)
Definition: classdecoder.cpp:257
void insert(const Pattern &pattern)
Definition: patternstore.h:580
Measurement begin(const string &title)
Definition: benchmarks.cpp:148
iterator begin()
Definition: datatypes.h:112
int parts(std::vector< Pattern > &container) const
Definition: pattern.cpp:1225
PatternModelOptions()
Definition: patternmodel.h:157
virtual void print(std::ostream *out, ClassDecoder &decoder)
Definition: patternmodel.h:1854
virtual unsigned int tokens() const
Definition: patternmodel.h:1376
unsigned char model_version
Definition: patternmodel.h:302
uint32_t sentence
Definition: datatypes.h:35
std::map< int, std::map< int, unsigned int > > cache_grouptotalpatterns
total distinct patterns per group
Definition: patternmodel.h:540
bool has(const Pattern &pattern) const
Definition: patternstore.h:587
virtual PatternStoreInterface * getstoreinterface()
Definition: patternmodel.h:288
std::vector< std::pair< Pattern, int > > getpatterns(const Pattern &pattern)
Definition: patternmodel.h:1834
PatternMap< double, BaseValueHandler< double >, uint64_t >::iterator t_relationmap_double_iterator
Definition: patternmodel.h:233
virtual t_relationmap getrightneighbours(const Pattern &pattern, int=0, int=0, int=0, int=0)
Definition: patternmodel.h:2175
virtual void trainskipgrams(PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL)
Definition: patternmodel.h:2453
bool hasskipgrams
Does this model have skipgrams?
Definition: patternmodel.h:565
Definition: patternmodel.h:87
int flexgramsize(const Pattern &pattern, IndexReference begin)
Definition: patternmodel.h:3300
std::map< int, std::map< int, unsigned int > > cache_grouptotaltokens
total covered tokens per group
Definition: patternmodel.h:542
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:283
virtual void add(const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2288
virtual void postread(const PatternModelOptions options)
Definition: patternmodel.h:547
virtual void add(const PatternPointer &pattern, ValueType *value, const IndexReference &ref)
Definition: patternmodel.h:3428
virtual size_t size() const
Definition: patternmodel.h:361
vector< uint32_t > compute_skip_configurations(const int n, const int maxskips)
Definition: algorithms.cpp:85
unsigned int totaloccurrencesingroup(int category, int n)
Definition: patternmodel.h:1623
int computeflexgrams_fromcooc(double threshold)
Definition: patternmodel.h:3217
virtual void add(const Pattern &pattern, IndexedData *value, const IndexReference &ref)
Definition: patternmodel.h:2282
std::string tostring() const
Definition: datatypes.h:72
PatternSetModel(std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: patternmodel.h:325
virtual void outputrelations(const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT)
Definition: patternmodel.h:2168
t_relationmap getrightcooc(const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, IndexedData *matches=NULL)
Definition: patternmodel.h:2939
virtual unsigned int pruneskipgrams(unsigned int threshold, int minskiptypes=2, int _n=0)
Definition: patternmodel.h:1758