Colibri Core
classencoder.h
Go to the documentation of this file.
1 #ifndef CLASSENCODER_H
2 #define CLASSENCODER_H
3 
4 /*****************************
5 * Colibri Core
6 * by Maarten van Gompel
7 * Centre for Language Studies
8 * Radboud University Nijmegen
9 *
10 * http://proycon.github.io/colibri-core
11 *
12 * Licensed under GPLv3
13 *****************************/
14 #if HAVE_CONFIG_H
15 #include <config.h>
16 #endif
17 #include <unordered_map>
18 #include <string>
19 #include <vector>
20 #include <fstream>
21 #include <pattern.h>
22 #include <common.h>
23 
24 #ifdef WITHFOLIA
25 #include "libfolia/document.h"
26 #include "libfolia/folia.h"
27 #endif
28 
50 class ClassEncoder {
51  private:
52  std::unordered_map<std::string,unsigned int> classes;
53  unsigned int highestclass;
54  unsigned int minlength;
55  unsigned int maxlength;
56  public:
57  static const unsigned char delimiterclass = 0;
58  static const unsigned char unknownclass = 2;
59  static const unsigned char skipclass = 3;
60  static const unsigned char flexclass = 4;
66  ClassEncoder(const unsigned int minlength = 0, const unsigned int maxlength = 0);
67 
74  ClassEncoder(const std::string & filename, const unsigned int minlength = 0, const unsigned int maxlength = 0); //load an existing classer
75 
82  void load(const std::string & filename, const unsigned int minlength = 0, const unsigned int maxlength = 0); //load an existing classer
83 
89  void build(const std::string & filename, unsigned int threshold=0);
90 
97  void build(std::vector<std::string> & files, bool quiet=false, unsigned int threshold =0);
98 
99  //auxiliary functions called by build: first do processcorpus() for each
100  //corpus, then call buildclasses() once when done:
101  //
102 
109  void buildclasses(const std::unordered_map<std::string,unsigned int> & freqlist, unsigned int threshold =0);
110 
116  void processcorpus(const std::string & filename, std::unordered_map<std::string,unsigned int> & freqlist);
122  void processcorpus(std::istream * in, std::unordered_map<std::string,unsigned int> & freqlist);
123  #ifdef WITHFOLIA
124 
129  void processfoliacorpus(const std::string & filename, std::unordered_map<std::string,unsigned int> & freqlist);
130  #endif
131 
132  std::unordered_map<unsigned int, std::string> added;
133 
134 
138  int outputlength(const std::string & line);
139 
148  int encodestring(const std::string & line, unsigned char * outputbuffer, bool allowunknown, bool autoaddunknown=false);
149 
159  void encodefile(const std::string & inputfilename, const std::string & outputfilename, bool allowunknown, bool autoaddunknown=false, bool append=false, bool quiet=false);
170  void encodefile(std::istream * IN, std::ostream * OUT, bool allowunknown, bool autoaddunknown, bool quiet=false, bool append=false);
171 
172  std::vector<unsigned int> encodeseq(const std::vector<std::string> & seq);
173 
182  Pattern buildpattern(const std::string & patternstring, bool allowunknown=false, bool autoaddunknown = false); //not thread-safe
190  Pattern buildpattern_safe(const std::string & patternstring, bool allowunknown=false, bool autoaddunknown = false); //thread-safe
191 
192 
196  void add(const std::string &, const unsigned int cls);
197 
201  unsigned int gethighestclass() { return highestclass; }
202 
206  void save(const std::string & filename);
207 
211  int size() const {
212  return classes.size();
213  }
214 
218  unsigned int operator[](const std::string & key) {
219  return classes[key];
220  }
221 
222  typedef std::unordered_map<std::string, unsigned int>::const_iterator const_iterator;
223 
224  const_iterator begin() const {
225  return classes.begin();
226  }
227  const_iterator end() const {
228  return classes.end();
229  }
230 };
231 
// Free helper functions declared next to ClassEncoder. Only the declarations
// live in this header.
// NOTE(review): several of these return a raw unsigned char*; who owns the
// returned buffer is not visible from here -- confirm before freeing it.

// Class number <-> byte buffer helpers.
unsigned int inttobytes(unsigned char* buffer, unsigned int cls);
unsigned char* inttobytes_v1(unsigned int, int& length);

// "v1" -> "v2" data conversion, from an in-memory buffer or from a stream.
unsigned char* convert_v1_v2(const unsigned char* olddata, unsigned int& newlength);
unsigned char* convert_v1_v2(std::istream* in, bool ignoreeol, bool debug);

// Stream and buffer utilities.
// NOTE(review): the unnamed int of readline() is presumably the buffer
// capacity, and `l` of countwords() the data length -- confirm.
int readline(std::istream* IN, unsigned char* buffer, int);
const int countwords(const unsigned char* data, int l);
240 #endif
std::unordered_map< std::string, unsigned int >::const_iterator const_iterator
Definition: classencoder.h:222
std::vector< unsigned int > encodeseq(const std::vector< std::string > &seq)
Definition: classencoder.cpp:269
const int countwords(const unsigned char *data, const int l)
unsigned char * convert_v1_v2(const unsigned char *olddata, unsigned int &newlength)
Definition: classencoder.cpp:530
unsigned char * inttobytes_v1(unsigned int, int &length)
Definition: classencoder.cpp:44
Pattern buildpattern_safe(const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false)
Definition: classencoder.cpp:396
void processcorpus(const std::string &filename, std::unordered_map< std::string, unsigned int > &freqlist)
Definition: classencoder.cpp:131
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
int outputlength(const std::string &line)
Definition: classencoder.cpp:277
static const unsigned char skipclass
Definition: classencoder.h:59
unsigned int inttobytes(unsigned char *buffer, unsigned int cls)
Definition: classencoder.cpp:22
int readline(std::istream *IN, unsigned char *buffer, const int)
void buildclasses(const std::unordered_map< std::string, unsigned int > &freqlist, unsigned int threshold=0)
Definition: classencoder.cpp:204
void encodefile(const std::string &inputfilename, const std::string &outputfilename, bool allowunknown, bool autoaddunknown=false, bool append=false, bool quiet=false)
Definition: classencoder.cpp:413
void build(const std::string &filename, unsigned int threshold=0)
Definition: classencoder.cpp:222
ClassEncoder(const unsigned int minlength=0, const unsigned int maxlength=0)
Definition: classencoder.cpp:81
static const unsigned char flexclass
Definition: classencoder.h:60
const_iterator begin() const
Definition: classencoder.h:224
void save(const std::string &filename)
Definition: classencoder.cpp:259
void load(const std::string &filename, const unsigned int minlength=0, const unsigned int maxlength=0)
Definition: classencoder.cpp:92
void add(const std::string &, const unsigned int cls)
Definition: classencoder.cpp:408
unsigned int gethighestclass()
Definition: classencoder.h:201
static const unsigned char delimiterclass
Definition: classencoder.h:57
static const unsigned char unknownclass
Definition: classencoder.h:58
Pattern buildpattern(const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false)
Definition: classencoder.cpp:384
int encodestring(const std::string &line, unsigned char *outputbuffer, bool allowunknown, bool autoaddunknown=false)
Definition: classencoder.cpp:323
unsigned int operator[](const std::string &key)
Definition: classencoder.h:218
Class for encoding plain-text to binary class-encoded data. The ClassEncoder maintains a mapping of w...
Definition: classencoder.h:50
Basic largely trivial functions for the common good.
std::unordered_map< unsigned int, std::string > added
Definition: classencoder.h:132
int size() const
Definition: classencoder.h:211
Contains the Pattern class that is ubiquitous throughout Colibri Core.
const_iterator end() const
Definition: classencoder.h:227