Class for encoding plain text to binary class-encoded data. The ClassEncoder maintains a mapping of words to classes (integers). It allows a corpus to be losslessly compressed by replacing words with their classes. Classes are assigned based on word frequency: frequent words receive a lower class number that can be represented in fewer bytes, and rare words receive a higher class number.
More...
#include <classencoder.h>
|
| ClassEncoder (const unsigned int minlength=0, const unsigned int maxlength=0) |
|
| ClassEncoder (const std::string &filename, const unsigned int minlength=0, const unsigned int maxlength=0) |
|
void | load (const std::string &filename, const unsigned int minlength=0, const unsigned int maxlength=0) |
|
void | build (const std::string &filename, unsigned int threshold=0) |
|
void | build (std::vector< std::string > &files, bool quiet=false, unsigned int threshold=0) |
|
void | buildclasses (const std::unordered_map< std::string, unsigned int > &freqlist, unsigned int threshold=0) |
|
void | processcorpus (const std::string &filename, std::unordered_map< std::string, unsigned int > &freqlist) |
|
void | processcorpus (std::istream *in, std::unordered_map< std::string, unsigned int > &freqlist) |
|
int | outputlength (const std::string &line) |
|
int | encodestring (const std::string &line, unsigned char *outputbuffer, bool allowunknown, bool autoaddunknown=false) |
|
void | encodefile (const std::string &inputfilename, const std::string &outputfilename, bool allowunknown, bool autoaddunknown=false, bool append=false, bool quiet=false) |
|
void | encodefile (std::istream *IN, std::ostream *OUT, bool allowunknown, bool autoaddunknown, bool quiet=false, bool append=false) |
|
std::vector< unsigned int > | encodeseq (const std::vector< std::string > &seq) |
|
Pattern | buildpattern (const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false) |
|
Pattern | buildpattern_safe (const std::string &patternstring, bool allowunknown=false, bool autoaddunknown=false) |
|
void | add (const std::string &, const unsigned int cls) |
|
unsigned int | gethighestclass () |
|
void | save (const std::string &filename) |
|
int | size () const |
|
unsigned int | operator[] (const std::string &key) |
|
const_iterator | begin () const |
|
const_iterator | end () const |
|
|
std::unordered_map< unsigned int, std::string > | added |
|
Class for encoding plain text to binary class-encoded data. The ClassEncoder maintains a mapping of words to classes (integers). It allows a corpus to be losslessly compressed by replacing words with their classes. Classes are assigned based on word frequency: frequent words receive a lower class number that can be represented in fewer bytes, and rare words receive a higher class number.
ClassEncoder::ClassEncoder |
( |
const unsigned int |
minlength = 0 , |
|
|
const unsigned int |
maxlength = 0 |
|
) |
| |
Constructor for an empty ClassEncoder
- Parameters
-
minlength | Minimum supported length of words (default: 0) |
maxlength | Maximum supported length of words (default: 0 = unlimited) |
ClassEncoder::ClassEncoder |
( |
const std::string & |
filename, |
|
|
const unsigned int |
minlength = 0 , |
|
|
const unsigned int |
maxlength = 0 |
|
) |
| |
Constructor for a ClassEncoder read from file
- Parameters
-
filename | The filename (*.colibri.cls) |
minlength | Minimum supported length of words (default: 0) |
maxlength | Maximum supported length of words (default: 0 = unlimited) |
void ClassEncoder::add |
( |
const std::string & |
s, |
|
|
const unsigned int |
cls |
|
) |
| |
Add the word with the specified class to the class encoding
void ClassEncoder::build |
( |
const std::string & |
filename, |
|
|
unsigned int |
threshold = 0 |
|
) |
| |
Build a class encoding from a plain-text corpus
- Parameters
-
filename | A plain-text corpus with the units of interest (e.g. sentences) each on one line |
threshold | Occurrence threshold; words occurring less often than this will be pruned |
void ClassEncoder::build |
( |
std::vector< std::string > & |
files, |
|
|
bool |
quiet = false , |
|
|
unsigned int |
threshold = 0 |
|
) |
| |
Build a class encoding from multiple plain-text corpus files
- Parameters
-
files | A list of plain-text corpus files with the units of interest (e.g. sentences) each on one line |
quiet | If true, do not output progress to stderr (default: false) |
threshold | Occurrence threshold; words occurring less often than this will be pruned |
void ClassEncoder::buildclasses |
( |
const std::unordered_map< std::string, unsigned int > & |
freqlist, |
|
|
unsigned int |
threshold = 0 |
|
) |
| |
Assign classes based on the computed frequency list. This method should only be called once.
- Parameters
-
freqlist | The data structure that contains the previously computed frequency list (read-only input) |
threshold | Occurrence threshold; words occurring less often than this will be pruned |
Pattern ClassEncoder::buildpattern |
( |
const std::string & |
patternstring, |
|
|
bool |
allowunknown = false , |
|
|
bool |
autoaddunknown = false |
|
) |
| |
Build a pattern from a string. Note: This function is not thread-safe! Use buildpattern_safe() instead if you need thread safety!
- Parameters
-
patternstring | The string you want to turn into a Pattern |
allowunknown | If the string contains unknown words, represent those using a single unknown class. If set to false, an exception will be raised when unknown words are present. (default: false) |
autoaddunknown | If the string contains unknown words, automatically add these words to the class encoding. Note that the class encoding will no longer be optimal if this is used. (default: false) |
- Returns
- a Pattern
Pattern ClassEncoder::buildpattern_safe |
( |
const std::string & |
patternstring, |
|
|
bool |
allowunknown = false , |
|
|
bool |
autoaddunknown = false |
|
) |
| |
Build a pattern from a string (thread-safe variant, slightly slower due to buffer allocation)
- Parameters
-
patternstring | The string you want to turn into a Pattern |
allowunknown | If the string contains unknown words, represent those using a single unknown class. If set to false, an exception will be raised when unknown words are present. (default: false) |
autoaddunknown | If the string contains unknown words, automatically add these words to the class encoding. Note that the class encoding will no longer be optimal if this is used. (default: false) |
- Returns
- a Pattern
void ClassEncoder::encodefile |
( |
const std::string & |
inputfilename, |
|
|
const std::string & |
outputfilename, |
|
|
bool |
allowunknown, |
|
|
bool |
autoaddunknown = false , |
|
|
bool |
append = false , |
|
|
bool |
quiet = false |
|
) |
| |
Create a class-encoded corpus file from a plain-text corpus file. Each of the units of interest (e.g. sentences) should occupy a single line (i.e., newline-delimited).
- Parameters
-
inputfilename | Filename of the input file, a plain-text corpus file |
outputfilename | Filename of the output file (binary class-encoded corpus file, *.colibri.dat) |
allowunknown | If the input contains unknown words, represent those using a single unknown class. If set to false, an exception will be raised when unknown words are present. (no default; must be specified) |
autoaddunknown | If the string contains unknown words, automatically add these words to the class encoding. Note that the class encoding will no longer be optimal if this is used. (default: false) |
append | Set to true if this is not the first file to write to the stream |
- Returns
- Nothing (the function is declared void); the encoded output is written to outputfilename
void ClassEncoder::encodefile |
( |
std::istream * |
IN, |
|
|
std::ostream * |
OUT, |
|
|
bool |
allowunknown, |
|
|
bool |
autoaddunknown, |
|
|
bool |
quiet = false , |
|
|
bool |
append = false |
|
) |
| |
Create a class-encoded corpus file from a plain-text corpus file. Each of the units of interest (e.g. sentences) should occupy a single line (i.e., newline-delimited).
- Parameters
-
IN | Input stream of a plain-text corpus file |
OUT | Output stream of a binary class-encoded corpus file (*.colibri.dat) |
allowunknown | If the input contains unknown words, represent those using a single unknown class. If set to false, an exception will be raised when unknown words are present. (no default in this overload; must be specified) |
autoaddunknown | If the input contains unknown words, automatically add these words to the class encoding. Note that the class encoding will no longer be optimal if this is used. (no default in this overload; must be specified) |
quiet | Set to true to suppress any output |
append | Set to true if this is not the first file to write to the stream |
- Returns
- Nothing (the function is declared void); the encoded output is written to the OUT stream
vector< unsigned int > ClassEncoder::encodeseq |
( |
const std::vector< std::string > & |
seq | ) |
|
int ClassEncoder::encodestring |
( |
const std::string & |
line, |
|
|
unsigned char * |
outputbuffer, |
|
|
bool |
allowunknown, |
|
|
bool |
autoaddunknown = false |
|
) |
| |
Low-level function to encode a string of words as a binary representation of classes
- Parameters
-
line | The string of words you want to encode as a binary representation of classes |
outputbuffer | Pointer to the output buffer, must be pre-allocated and have enough space |
allowunknown | If the string contains unknown words, represent those using a single unknown class. If set to false, an exception will be raised when unknown words are present. (no default; must be specified) |
autoaddunknown | If the string contains unknown words, automatically add these words to the class encoding. Note that the class encoding will no longer be optimal if this is used. (default: false) |
- Returns
- The number of bytes written to outputbuffer
unsigned int ClassEncoder::gethighestclass |
( |
| ) |
|
|
inline |
Returns the highest assigned class in the class encoding
void ClassEncoder::load |
( |
const std::string & |
filename, |
|
|
const unsigned int |
minlength = 0 , |
|
|
const unsigned int |
maxlength = 0 |
|
) |
| |
Load a class encoding from file
- Parameters
-
filename | The filename (*.colibri.cls) |
minlength | Minimum supported length of words (default: 0) |
maxlength | Maximum supported length of words (default: 0 = unlimited) |
unsigned int ClassEncoder::operator[] |
( |
const std::string & |
key | ) |
|
|
inline |
Return the class for the given word
int ClassEncoder::outputlength |
( |
const std::string & |
line | ) |
|
Computes how many bytes the class representation for this input line would take
void ClassEncoder::processcorpus |
( |
const std::string & |
filename, |
|
|
std::unordered_map< std::string, unsigned int > & |
freqlist |
|
) |
| |
Count word frequency in a given plain-text corpus.
- Parameters
-
filename | The corpus file |
freqlist | The resulting frequency list, should be shared between multiple calls to processcorpus() |
void ClassEncoder::processcorpus |
( |
std::istream * |
in, |
|
|
std::unordered_map< std::string, unsigned int > & |
freqlist |
|
) |
| |
Count word frequency in a given plain-text corpus.
- Parameters
-
in | The input stream |
freqlist | The resulting frequency list, should be shared between multiple calls to processcorpus() |
void ClassEncoder::save |
( |
const std::string & |
filename | ) |
|
Save the class encoding to file
int ClassEncoder::size |
( |
| ) |
const |
|
inline |
Returns the number of classes, i.e. word types
std::unordered_map<unsigned int, std::string> ClassEncoder::added |
const unsigned char ClassEncoder::delimiterclass = 0 |
|
static |
const unsigned char ClassEncoder::flexclass = 4 |
|
static |
const unsigned char ClassEncoder::skipclass = 3 |
|
static |
const unsigned char ClassEncoder::unknownclass = 2 |
|
static |
The documentation for this class was generated from the following files: