Colibri Core
alignmodel.h
Go to the documentation of this file.
1 #ifndef ALIGNMODEL_H
2 #define ALIGNMODEL_H
3 
4 #include "patternmodel.h"
5 
6 
// Alignment model: a PatternMap keyed by Pattern whose mapped value is a
// PatternFeatureVectorMap<FeatureType>. print() iterates each mapped value as
// a sequence of PatternFeatureVector<FeatureType>* entries, each carrying a
// second pattern and a vector of FeatureType features.
7 template<class FeatureType>
8 class PatternAlignmentModel: public PatternMap<PatternFeatureVectorMap<FeatureType>,PatternFeatureVectorMapHandler<FeatureType>> {
9  protected:
10  //some duplication from PatternModel, but didn't want to inherit from
11  //it, as too much is different
// Type tag; set from getmodeltype() by the constructors and overwritten by load() with the byte read from the file.
12  unsigned char model_type;
// Format version; set from getmodelversion() by the constructors and overwritten by load() with the byte read from the file.
13  unsigned char model_version;
14  uint64_t totaltokens; //INCLUDES TOKENS NOT COVERED BY THE MODEL!
15  uint64_t totaltypes; //TOTAL UNIGRAM TYPES, INCLUDING NOT COVERED BY THE MODEL!
16 
// Largest pattern length seen by postread(); the constructors start it at 0.
17  int maxn;
// Smallest pattern length seen by postread(); the constructors start it at 999.
18  int minn;
19 
20 
21  virtual void postread(const PatternModelOptions options) {
22  //this function has a specialisation specific to indexed pattern models,
23  //this is the generic version
24  for (iterator iter = this->begin(); iter != this->end(); iter++) {
25  const Pattern p = iter->first;
26  const int n = p.n();
27  if (n > maxn) maxn = n;
28  if (n < minn) minn = n;
29  }
30  }
31  public:
33  totaltokens = 0;
34  totaltypes = 0;
35  maxn = 0;
36  minn = 999;
37  model_type = this->getmodeltype();
38  model_version = this->getmodelversion();
39  }
40  PatternAlignmentModel<FeatureType>(std::istream *f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) { //load from file
41  totaltokens = 0;
42  totaltypes = 0;
43  maxn = 0;
44  minn = 999;
45  model_type = this->getmodeltype();
46  model_version = this->getmodelversion();
47  this->load(f,options,constrainmodel);
48  }
49 
50  PatternAlignmentModel<FeatureType>(const std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) { //load from file
51  totaltokens = 0;
52  totaltypes = 0;
53  maxn = 0;
54  minn = 999;
55  model_type = this->getmodeltype();
56  model_version = this->getmodelversion();
57  if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
58  std::ifstream * in = new std::ifstream(filename.c_str());
59  if (!in->good()) {
60  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
61  throw InternalError();
62  }
63  this->load( (std::istream *) in, options, constrainmodel);
64  in->close();
65  delete in;
66  }
67 
68  virtual int getmodeltype() const { return PATTERNALIGNMENTMODEL; }
69  virtual int getmodelversion() const { return 2; }
70 
// Size of the model as a size_t; load() reports this value as the number of
// patterns read when debugging is enabled.
// NOTE(review): the body line is missing from this listing (the embedded
// numbering jumps from 71 to 73). Presumably it forwards to the PatternMap
// base implementation — restore it from the upstream header before compiling.
71  virtual size_t size() const {
73  }
// Membership test for a source pattern; the two-pattern has() overload calls
// this first before looking at the target side.
// NOTE(review): the body line is missing from this listing (the embedded
// numbering jumps from 74 to 76). Presumably it forwards to the PatternMap
// base implementation — restore it from the upstream header before compiling.
74  virtual bool has(const Pattern & pattern) const {
76  }
// Membership test for a source pattern given as a PatternPointer.
// NOTE(review): the body line is missing from this listing (the embedded
// numbering jumps from 77 to 79). Presumably it forwards to the PatternMap
// base implementation — restore it from the upstream header before compiling.
77  virtual bool has(const PatternPointer & pattern) const {
79  }
80 
81  virtual void load(std::string filename, const PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) {
82  if (!options.QUIET) std::cerr << "Loading " << filename << std::endl;
83  std::ifstream * in = new std::ifstream(filename.c_str());
84  if (!in->good()) {
85  std::cerr << "ERROR: Unable to load file " << filename << std::endl;
86  throw InternalError();
87  }
88  this->load( (std::istream *) in, options, constrainmodel);
89  in->close();
90  delete in;
91  }
92 
93  virtual void load(std::istream * f, PatternModelOptions options, PatternModelInterface * constrainmodel = NULL) { //load from file
94  options.MINTOKENS = 1; //other values would be meaningless
95 
96  char null;
97  f->read( (char*) &null, sizeof(char));
98  f->read( (char*) &model_type, sizeof(char));
99  f->read( (char*) &model_version, sizeof(char));
100  if (model_version == 1) this->classencodingversion = 1;
101  if ((null != 0) || (model_type != PATTERNALIGNMENTMODEL )) {
102  std::cerr << "File is not a colibri alignment model file (did you try to load a different type of pattern model?)" << std::endl;
103  throw InternalError();
104  }
105  if (model_version > 2) {
106  std::cerr << "WARNING: Model is created with a newer version of Colibri Core! Attempting to continue but failure is likely..." << std::endl;
107  }
108  f->read( (char*) &totaltokens, sizeof(uint64_t));
109  f->read( (char*) &totaltypes, sizeof(uint64_t));
110 
111  if (options.DEBUG) {
112  std::cerr << "Debug enabled, loading Alignment Model type " << (int) model_type << ", version " << (int) model_version << std::endl;
113  std::cerr << "Total tokens: " << totaltokens << ", total types: " << totaltypes << std::endl;;
114  }
115 
116  PatternStoreInterface * constrainstore = NULL;
117  if (constrainmodel) constrainstore = constrainmodel->getstoreinterface();
118 
119  PatternMap<PatternFeatureVectorMap<FeatureType>,PatternFeatureVectorMapHandler<FeatureType>>::template read(f, options.MINTOKENS,options.MINLENGTH, options.MAXLENGTH, constrainstore, !options.DOREMOVENGRAMS, !options.DOREMOVESKIPGRAMS, !options.DOREMOVEFLEXGRAMS, options.DORESET, options.DEBUG);
120  if (options.DEBUG) std::cerr << "Read " << this->size() << " patterns" << std::endl;
121  this->postread(options);
122  }
123 
125  return (PatternModelInterface*) this;
126  }
127 
// Serialises the model header to `out`: a zero byte, the model type byte, the
// version byte, then totaltokens and totaltypes as raw uint64_t values. This
// mirrors the header that load() reads.
// NOTE(review): one line is missing from this listing (the embedded numbering
// jumps from 136 to 138). Presumably it writes the pattern data via the
// PatternMap base class — restore it from the upstream header; as listed,
// only the header is written.
128  void write(std::ostream * out) {
129  const char null = 0;
130  out->write( (char*) &null, sizeof(char));
131  unsigned char t = this->getmodeltype();
132  out->write( (char*) &t, sizeof(char));
133  unsigned char v = this->getmodelversion();
134  out->write( (char*) &v, sizeof(char));
135  out->write( (char*) &totaltokens, sizeof(uint64_t));
136  out->write( (char*) &totaltypes, sizeof(uint64_t));
138  }
139 
140  void write(const std::string filename) {
141  std::ofstream * out = new std::ofstream(filename.c_str());
142  this->write(out);
143  out->close();
144  delete out;
145  }
146 
148  typedef typename PatternMap<PatternFeatureVectorMap<FeatureType>,PatternFeatureVectorMapHandler<FeatureType>>::const_iterator const_iterator;
149 
150 
151  virtual int maxlength() const { return maxn; };
152  virtual int minlength() const { return minn; };
153 
154  virtual int occurrencecount(const Pattern & pattern) {
155  return 0; // we don't do occurrence counts
156  }
157 
158  virtual PatternFeatureVectorMap<FeatureType> * getdata(const Pattern & pattern, bool makeifnew=false) {
159  typename PatternMap<PatternFeatureVectorMap<FeatureType>,PatternFeatureVectorMapHandler<FeatureType>>::iterator iter = this->find(pattern);
160  if (iter != this->end()) {
161  return &(iter->second);
162  } else if (makeifnew) {
163  return &((*this)[pattern]);
164  } else {
165  return NULL;
166  }
167  }
168 
169  virtual PatternFeatureVectorMap<FeatureType> * getdata(const PatternPointer & patternpointer, bool makeifnew=false) {
170  const Pattern pattern = Pattern(patternpointer);
171  typename PatternMap<PatternFeatureVectorMap<FeatureType>,PatternFeatureVectorMapHandler<FeatureType>>::iterator iter = this->find(pattern);
172  if (iter != this->end()) {
173  return &(iter->second);
174  } else if (makeifnew) {
175  return &((*this)[pattern]);
176  } else {
177  return NULL;
178  }
179  }
180 
181  //not really useful in this context
182  int types() const { return totaltypes; }
183  int tokens() const { return totaltokens; }
184 
185  unsigned char type() const { return model_type; }
186  unsigned char version() const { return model_version; }
187 
188 
189  //(source,target) pair versions of has, getdata
190  virtual bool has(const Pattern & pattern, const Pattern & pattern2) {
191  return (this->has(pattern) && this->getdata(pattern)->has(pattern2));
192  }
193  virtual bool has(const PatternPointer & patternpointer, const PatternPointer & patternpointer2) {
194  const Pattern pattern2 = Pattern(patternpointer2);
195  return (this->has(patternpointer) && this->getdata(patternpointer)->has(pattern2));
196  }
197 
198  virtual PatternFeatureVector<FeatureType> * getdata(const Pattern & pattern, const Pattern & pattern2, bool makeifnew=false) {
199  PatternFeatureVectorMap<FeatureType> * fvmap = this->getdata(pattern, makeifnew);
200  if (fvmap == NULL) return NULL;
201  return fvmap->getdata(pattern2);
202  }
203 
// Stores `features` as the feature vector of the (pattern, pattern2) pair.
// With `checkifexists` set, an existing vector for the pair is looked up and,
// if found, cleared and refilled; otherwise a new vector is inserted into the
// map belonging to `pattern`.
// NOTE(review): two lines are missing from this listing (the embedded
// numbering skips 205 and 211): the declaration of `fv` and the construction
// of `pfv`. Presumably `fv` starts out NULL and `pfv` is a newly allocated
// PatternFeatureVector for pattern2/features — restore both from the upstream
// header before compiling.
204  void add(const Pattern & pattern, const Pattern & pattern2, std::vector<FeatureType> & features, bool checkifexists= true) {
206  if (checkifexists) {
// makeifnew=true: the map for `pattern` is created here if it does not exist yet.
207  fv = getdata(pattern,pattern2,true);
208  }
209  if (fv == NULL) {
210  PatternFeatureVectorMap<FeatureType> * fvm = this->getdata(pattern, true);
212  fvm->insert(pfv, checkifexists); //(will be freed again by fvm destructor)
213  } else {
214  fv->clear(); //will be overwritten by new features
215  for (typename std::vector<FeatureType>::iterator iter = features.begin(); iter != features.end(); iter++) {
216  fv->push_back(*iter);
217  }
218  }
219  }
220 
221  virtual void printmodel(std::ostream * out, ClassDecoder & sourcedecoder, ClassDecoder & targetdecoder) { //alias for cython (doesn't like methods named print)
222  print(out,sourcedecoder, targetdecoder);
223  }
224 
225  virtual void print(std::ostream * out, ClassDecoder & sourcedecoder, ClassDecoder & targetdecoder) {
226  *out << "PATTERN\tPATTERN2\tFEATURES" << std::endl;
227  for (iterator iter = this->begin(); iter != this->end(); iter++) {
228  const Pattern sourcepattern = iter->first;
229  for (typename PatternFeatureVectorMap<FeatureType>::iterator iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++) {
230  PatternFeatureVector<FeatureType> * pfv = *iter2;
231  const Pattern targetpattern = pfv->pattern;
232  *out << sourcepattern.tostring(sourcedecoder) << "\t" << targetpattern.tostring(targetdecoder);
233  for (typename std::vector<FeatureType>::iterator iter3 = pfv->data.begin(); iter3 != pfv->data.end(); iter3++) {
234  *out << "\t" << *iter3;
235  }
236  *out << std::endl;
237  }
238  }
239  }
240 };
241 
242 
243 #endif
virtual PatternFeatureVectorMap< FeatureType > * getdata(const Pattern &pattern, bool makeifnew=false)
Definition: alignmodel.h:158
int MAXLENGTH
The maximum length of patterns to be loaded/extracted, inclusive (in words/tokens) (default: 100) ...
Definition: patternmodel.h:126
void read(std::istream *in, int MINTOKENS=0, int MINLENGTH=0, int MAXLENGTH=999999, PatternStoreInterface *constrainstore=NULL, bool DONGRAMS=true, bool DOSKIPGRAMS=true, bool DOFLEXGRAMS=true, bool DORESET=false, bool DEBUG=false)
Definition: patternstore.h:457
virtual bool has(const Pattern &pattern) const
Definition: alignmodel.h:74
Definition: datatypes.h:477
virtual size_t size() const
Definition: alignmodel.h:71
unsigned char model_type
Definition: alignmodel.h:12
std::string tostring(const ClassDecoder &classdecoder) const
Definition: pattern.cpp:278
void push_back(FeatureType &f)
Definition: datatypes.h:339
virtual PatternFeatureVectorMap< FeatureType > * getdata(const PatternPointer &patternpointer, bool makeifnew=false)
Definition: alignmodel.h:169
int MINTOKENS
Definition: patternmodel.h:113
Definition: pattern.h:357
bool DOREMOVESKIPGRAMS
Remove skip-grams from the model upon loading it.
Definition: patternmodel.h:146
void write(std::ostream *out)
Definition: alignmodel.h:128
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
PatternMap< PatternFeatureVectorMap< FeatureType >, PatternFeatureVectorMapHandler< FeatureType > >::const_iterator const_iterator
Definition: alignmodel.h:148
std::vector< PatternFeatureVector< FeatureType > * >::iterator iterator
Definition: datatypes.h:359
virtual bool has(const PatternPointer &patternpointer, const PatternPointer &patternpointer2)
Definition: alignmodel.h:193
unsigned char type() const
Definition: alignmodel.h:185
bool DORESET
sets all counts to zero upon loading, clears indices
Definition: patternmodel.h:148
virtual void postread(const PatternModelOptions options)
Definition: alignmodel.h:21
unsigned char version() const
Definition: alignmodel.h:186
virtual bool has(const Pattern &pattern, const Pattern &pattern2)
Definition: alignmodel.h:190
Basic read-only interface for pattern models, abstract base class.
Definition: interface.h:39
virtual void load(std::string filename, const PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: alignmodel.h:81
Limited virtual interface to pattern stores.
Definition: interface.h:20
Definition: alignmodel.h:8
virtual int getmodelversion() const
Definition: alignmodel.h:69
virtual void load(std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL)
Definition: alignmodel.h:93
bool DEBUG
Output extra debug information.
Definition: patternmodel.h:151
bool DOREMOVEFLEXGRAMS
Remove flexgrams from the model upon loading it.
Definition: patternmodel.h:147
Class for decoding binary class-encoded data back to plain-text. The ClassDecoder maintains a mapping...
Definition: classdecoder.h:43
void write(const std::string filename)
Definition: alignmodel.h:140
virtual bool has(const PatternPointer &pattern) const
Definition: alignmodel.h:77
virtual int getmodeltype() const
Definition: alignmodel.h:68
virtual void printmodel(std::ostream *out, ClassDecoder &sourcedecoder, ClassDecoder &targetdecoder)
Definition: alignmodel.h:221
A pattern map storing patterns and their values in a hash map (unordered_map).
Definition: patternstore.h:782
virtual int occurrencecount(const Pattern &pattern)
Definition: alignmodel.h:154
virtual void print(std::ostream *out, ClassDecoder &sourcedecoder, ClassDecoder &targetdecoder)
Definition: alignmodel.h:225
Contains classes for Pattern Models.
virtual int minlength() const
Definition: alignmodel.h:152
void add(const Pattern &pattern, const Pattern &pattern2, std::vector< FeatureType > &features, bool checkifexists=true)
Definition: alignmodel.h:204
Pattern pattern
Definition: datatypes.h:272
Definition: patternmodel.h:78
PatternMap< PatternFeatureVectorMap< FeatureType >, PatternFeatureVectorMapHandler< FeatureType > >::iterator iterator
Definition: alignmodel.h:147
void insert(PatternFeatureVector< FeatureType > *pfv, bool checkexists=true)
Definition: datatypes.h:410
bool QUIET
Don't output to stderr.
Definition: patternmodel.h:150
Options for Pattern Model loading and training.
Definition: patternmodel.h:111
int types() const
Definition: alignmodel.h:182
int maxn
Definition: alignmodel.h:17
std::vector< FeatureType > data
Definition: datatypes.h:273
uint64_t totaltokens
Definition: alignmodel.h:14
virtual PatternFeatureVector< FeatureType > * getdata(const Pattern &pattern, const Pattern &pattern2, bool makeifnew=false)
Definition: alignmodel.h:198
bool DOREMOVENGRAMS
Remove n-grams from the model upon loading it.
Definition: patternmodel.h:145
int minn
Definition: alignmodel.h:18
const size_t n() const
Definition: pattern.cpp:89
Definition: common.h:35
uint64_t totaltypes
Definition: alignmodel.h:15
void clear()
Definition: datatypes.h:336
int MINLENGTH
The minimum length of patterns to be loaded/extracted (in words/tokens) (default: 1) ...
Definition: patternmodel.h:125
Definition: datatypes.h:270
int tokens() const
Definition: alignmodel.h:183
virtual int maxlength() const
Definition: alignmodel.h:151
PatternModelInterface * getinterface()
Definition: alignmodel.h:124
Definition: datatypes.h:352
unsigned char model_version
Definition: alignmodel.h:13
virtual PatternFeatureVector< FeatureType > * getdata(const Pattern &pattern)
Definition: datatypes.h:457