17 #include <unordered_map>
25 #include "libfolia/document.h"
26 #include "libfolia/folia.h"
/// Lookup table from a string key to its unsigned integer class value.
52 std::unordered_map<std::string,unsigned int> classes;
/// NOTE(review): presumably the largest class value handed out so far — confirm in classencoder.cpp.
53 unsigned int highestclass;
/// NOTE(review): minlength/maxlength share their names with the constructor and load() parameters;
/// looks like a length filter on entries — confirm the unit and the meaning of 0.
54 unsigned int minlength;
55 unsigned int maxlength;
/// Constructs an encoder without reading anything from a file.
/// @param minlength defaults to 0. NOTE(review): presumably a lower length bound, 0 = no bound — confirm.
/// @param maxlength defaults to 0. NOTE(review): presumably an upper length bound, 0 = no bound — confirm.
66 ClassEncoder(
const unsigned int minlength = 0,
const unsigned int maxlength = 0);
/// Constructs an encoder from the file named by `filename`.
/// Takes the same arguments as load(); NOTE(review): presumably forwards to it — confirm.
/// @param filename  file to read
/// @param minlength defaults to 0 (see load())
/// @param maxlength defaults to 0 (see load())
74 ClassEncoder(
const std::string & filename,
const unsigned int minlength = 0,
const unsigned int maxlength = 0);
/// Loads encoder data from the file named by `filename`.
/// @param filename  file to read. NOTE(review): presumably a class file as written by save() — confirm.
/// @param minlength defaults to 0. NOTE(review): looks like entries below this length are skipped — confirm.
/// @param maxlength defaults to 0. NOTE(review): looks like entries above this length are skipped — confirm.
82 void load(
const std::string & filename,
const unsigned int minlength = 0,
const unsigned int maxlength = 0);
/// Builds the encoder from a single input file.
/// @param filename  input file. NOTE(review): presumably a plain-text corpus — confirm.
/// @param threshold defaults to 0. NOTE(review): presumably a minimum frequency, as in buildclasses() — confirm.
89 void build(
const std::string & filename,
unsigned int threshold=0);
/// Builds the encoder from several input files.
/// @param files     list of input file names; taken by non-const reference.
///                  NOTE(review): check whether the implementation modifies it.
/// @param quiet     defaults to false. NOTE(review): presumably suppresses progress output — confirm.
/// @param threshold defaults to 0. NOTE(review): presumably a minimum frequency — confirm.
97 void build(std::vector<std::string> & files,
bool quiet=
false,
unsigned int threshold =0);
/// Builds the class mapping from a string -> count table.
/// @param freqlist  read-only table with one unsigned count per string
/// @param threshold defaults to 0. NOTE(review): presumably strings counted below this get no class — confirm.
109 void buildclasses(
const std::unordered_map<std::string,unsigned int> & freqlist,
unsigned int threshold =0);
/// Processes the corpus file named by `filename`.
/// @param filename input file
/// @param freqlist mutable string -> count table, passed by reference.
///                 NOTE(review): presumably counts are accumulated into it — confirm.
116 void processcorpus(
const std::string & filename, std::unordered_map<std::string,unsigned int> & freqlist);
/// Stream overload of processcorpus(): reads from `in` instead of opening a file by name.
/// @param in       input stream pointer. NOTE(review): assumed non-null and owned by the caller — confirm.
/// @param freqlist mutable string -> count table, passed by reference
122 void processcorpus(std::istream * in, std::unordered_map<std::string,unsigned int> & freqlist);
/// Counterpart of processcorpus() for FoLiA input.
/// NOTE(review): presumably parses a FoLiA XML document via the libfolia headers included above — confirm.
/// @param filename input file
/// @param freqlist mutable string -> count table, passed by reference
129 void processfoliacorpus(
const std::string & filename, std::unordered_map<std::string,unsigned int> & freqlist);
/// Reverse-direction table: unsigned integer class value -> string.
/// NOTE(review): presumably holds entries created on the fly (cf. the autoaddunknown flags) — confirm.
132 std::unordered_map<unsigned int, std::string>
added;
/// Encodes one line of text into a caller-supplied byte buffer.
/// @param line           text to encode
/// @param outputbuffer   destination buffer. NOTE(review): its capacity is not passed in, so the caller
///                       must size it beforehand (outputlength() looks like the helper for that) — confirm.
/// @param allowunknown   NOTE(review): presumably maps unknown tokens to the unknown class instead of failing — confirm.
/// @param autoaddunknown defaults to false. NOTE(review): presumably assigns new classes to unknown tokens — confirm.
/// @return int. NOTE(review): presumably the number of bytes written — confirm.
148 int encodestring(
const std::string & line,
unsigned char * outputbuffer,
bool allowunknown,
bool autoaddunknown=
false);
/// Encodes the file `inputfilename` and writes the result to `outputfilename`.
/// Trailing flags here are ordered (append, quiet); the stream overload orders them (quiet, append).
/// @param inputfilename  file to read
/// @param outputfilename file to write
/// @param allowunknown   see encodestring()
/// @param autoaddunknown defaults to false; see encodestring()
/// @param append         defaults to false. NOTE(review): presumably appends to the output instead of truncating — confirm.
/// @param quiet          defaults to false. NOTE(review): presumably suppresses progress output — confirm.
159 void encodefile(
const std::string & inputfilename,
const std::string & outputfilename,
bool allowunknown,
bool autoaddunknown=
false,
bool append=
false,
bool quiet=
false);
/// Stream overload of encodefile(): reads from `IN` and writes to `OUT`.
/// CAUTION: the trailing flags are (quiet, append) here — the opposite order from the file-name
/// overload — and autoaddunknown has no default, so positional bool arguments are easy to swap.
/// @param IN             input stream pointer. NOTE(review): assumed non-null, caller-owned — confirm.
/// @param OUT            output stream pointer. NOTE(review): assumed non-null, caller-owned — confirm.
/// @param allowunknown   see encodestring()
/// @param autoaddunknown see encodestring()
/// @param quiet          defaults to false
/// @param append         defaults to false
170 void encodefile(std::istream * IN, std::ostream * OUT,
bool allowunknown,
bool autoaddunknown,
bool quiet=
false,
bool append=
false);
/// Converts a sequence of strings into a sequence of unsigned integers.
/// NOTE(review): presumably one class value per input string, in order; handling of strings
/// missing from the mapping is not visible here — confirm in classencoder.cpp.
/// @param seq strings to convert
/// @return vector of unsigned integers, returned by value
172 std::vector<unsigned int>
encodeseq(
const std::vector<std::string> & seq);
/// Builds a Pattern (declared in pattern.h) from its textual form.
/// @param patternstring  text to convert
/// @param allowunknown   defaults to false; see encodestring()
/// @param autoaddunknown defaults to false; see encodestring()
/// @return the Pattern, by value
182 Pattern buildpattern(
const std::string & patternstring,
bool allowunknown=
false,
bool autoaddunknown =
false);
/// Adds an entry for the given string with class value `cls`.
/// The first parameter is unnamed in this declaration. NOTE(review): presumably the word/key to register;
/// whether highestclass is updated is not visible here — confirm.
/// @param cls class value to associate with the string
196 void add(
const std::string &,
const unsigned int cls);
/// Writes the encoder to the file named by `filename`.
/// NOTE(review): presumably in the format load() reads back — confirm.
/// @param filename output file
206 void save(
const std::string & filename);
212 return classes.size();
/// Read-only iterator over the string -> class entries; this is the type returned by begin() and end().
222 typedef std::unordered_map<std::string, unsigned int>::const_iterator
const_iterator;
225 return classes.begin();
227 const_iterator
end()
const {
228 return classes.end();
/// Serialises the class value `cls` into `buffer`.
/// @param buffer destination bytes. NOTE(review): capacity is not passed in; the caller must
///               guarantee enough room — confirm the maximum encoded size.
/// @param cls    class value to serialise
/// @return unsigned int. NOTE(review): presumably the number of bytes written — confirm.
232 unsigned int inttobytes(
unsigned char * buffer,
unsigned int cls);
/// Reads one line of data from `IN` into `buffer`.
/// The third parameter is an unnamed const int. NOTE(review): presumably the buffer capacity — confirm.
/// @param IN     input stream pointer. NOTE(review): assumed non-null — confirm.
/// @param buffer destination bytes
/// @return int. NOTE(review): presumably the number of bytes read — confirm.
234 int readline(std::istream* IN,
unsigned char* buffer,
const int);
/// Converts a byte buffer from the v1 to the v2 representation (as the function name states).
/// @param olddata   read-only source bytes. NOTE(review): no length is passed, so the input
///                  must be self-delimiting — confirm how its end is detected.
/// @param newlength out-parameter (non-const reference). NOTE(review): presumably receives the size of the result — confirm.
/// @return pointer to the converted bytes. NOTE(review): ownership and deallocation are not
///         visible here — confirm whether the caller must free the buffer.
236 unsigned char *
convert_v1_v2(
const unsigned char * olddata,
unsigned int & newlength);
/// Stream overload of convert_v1_v2(): the source data is read from `in`.
/// @param in        input stream pointer. NOTE(review): assumed non-null — confirm.
/// @param ignoreeol NOTE(review): presumably skips end-of-line markers while reading — confirm.
/// @param debug     NOTE(review): presumably enables diagnostic output — confirm.
/// @return pointer to the converted bytes. NOTE(review): no length is returned and ownership
///         is not visible here — confirm both.
237 unsigned char *
convert_v1_v2(std::istream * in,
bool ignoreeol,
bool debug);
/// Counts the words in a byte buffer.
/// The `const` on the by-value int return has no effect for callers; it is left untouched here
/// because the declaration must match the definition in classencoder.cpp.
/// @param data read-only bytes. NOTE(review): presumably class-encoded data — confirm.
/// @param l    NOTE(review): presumably the length of `data` in bytes — confirm.
/// @return the word count
239 const int countwords(
const unsigned char* data,
const int l);
std::unordered_map< std::string, unsigned int >::const_iterator const_iterator
Definition: classencoder.h:222
std::vector< unsigned int > encodeseq(const std::vector< std::string > &seq)
Definition: classencoder.cpp:269
const int countwords(const unsigned char *data, const int l)
unsigned char * convert_v1_v2(const unsigned char *olddata, unsigned int &newlength)
Definition: classencoder.cpp:530
unsigned char * inttobytes_v1(unsigned int, int &length)
Definition: classencoder.cpp:44
Pattern buildpattern_safe(const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false)
Definition: classencoder.cpp:396
void processcorpus(const std::string &filename, std::unordered_map< std::string, unsigned int > &freqlist)
Definition: classencoder.cpp:131
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion...
Definition: pattern.h:75
int outputlength(const std::string &line)
Definition: classencoder.cpp:277
static const unsigned char skipclass
Definition: classencoder.h:59
unsigned int inttobytes(unsigned char *buffer, unsigned int cls)
Definition: classencoder.cpp:22
int readline(std::istream *IN, unsigned char *buffer, const int)
void buildclasses(const std::unordered_map< std::string, unsigned int > &freqlist, unsigned int threshold=0)
Definition: classencoder.cpp:204
void encodefile(const std::string &inputfilename, const std::string &outputfilename, bool allowunknown, bool autoaddunknown=false, bool append=false, bool quiet=false)
Definition: classencoder.cpp:413
void build(const std::string &filename, unsigned int threshold=0)
Definition: classencoder.cpp:222
ClassEncoder(const unsigned int minlength=0, const unsigned int maxlength=0)
Definition: classencoder.cpp:81
static const unsigned char flexclass
Definition: classencoder.h:60
const_iterator begin() const
Definition: classencoder.h:224
void save(const std::string &filename)
Definition: classencoder.cpp:259
void load(const std::string &filename, const unsigned int minlength=0, const unsigned int maxlength=0)
Definition: classencoder.cpp:92
void add(const std::string &, const unsigned int cls)
Definition: classencoder.cpp:408
unsigned int gethighestclass()
Definition: classencoder.h:201
static const unsigned char delimiterclass
Definition: classencoder.h:57
static const unsigned char unknownclass
Definition: classencoder.h:58
Pattern buildpattern(const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false)
Definition: classencoder.cpp:384
int encodestring(const std::string &line, unsigned char *outputbuffer, bool allowunknown, bool autoaddunknown=false)
Definition: classencoder.cpp:323
unsigned int operator[](const std::string &key)
Definition: classencoder.h:218
Class for encoding plain-text to binary class-encoded data. The ClassEncoder maintains a mapping of w...
Definition: classencoder.h:50
Basic largely trivial functions for the common good.
std::unordered_map< unsigned int, std::string > added
Definition: classencoder.h:132
int size() const
Definition: classencoder.h:211
Contains the Pattern class that is ubiquitous throughout Colibri Core.
const_iterator end() const
Definition: classencoder.h:227