Class for decoding binary class-encoded data back to plain text. The ClassDecoder maintains a mapping of classes (integers) to words. It allows decoding of a corpus that was losslessly compressed by replacing words with classes. The classes are distributed based on word frequency, with frequent words receiving a lower class number that can be represented in fewer bytes, and rare words receiving a higher class number.
More...
#include <classdecoder.h>
|
| ClassDecoder () |
|
| ClassDecoder (const std::string &filename) |
|
void | load (const std::string &filename) |
|
std::vector< std::string > | decodeseq (const std::vector< int > &seq) |
|
void | decodefile (const std::string &filename, std::ostream *, unsigned int start=0, unsigned int end=0, bool quiet=false) |
|
void | decodefile_v1 (std::ifstream *in, std::ostream *out, unsigned int start=0, unsigned int end=0, bool quiet=false) |
|
std::string | decodefiletostring (const std::string &filename, unsigned int start=0, unsigned int end=0, bool quiet=true) |
|
int | size () const |
|
std::string | operator[] (unsigned int key) const |
|
void | add (const unsigned int, const std::string &) |
|
unsigned int | gethighestclass () |
|
bool | hasclass (unsigned int key) const |
|
unsigned int | newclass () |
|
void | prune (unsigned int threshold) |
|
const_iterator | begin () const |
|
const_iterator | end () const |
|
Class for decoding binary class-encoded data back to plain text. The ClassDecoder maintains a mapping of classes (integers) to words. It allows decoding of a corpus that was losslessly compressed by replacing words with classes. The classes are distributed based on word frequency, with frequent words receiving a lower class number that can be represented in fewer bytes, and rare words receiving a higher class number.
ClassDecoder::ClassDecoder |
( |
| ) |
|
Constructor for an empty class decoder
ClassDecoder::ClassDecoder |
( |
const std::string & |
filename | ) |
|
Constructor for a class decoder loading a class encoding from file
void ClassDecoder::add |
( |
const unsigned int |
cls, |
|
|
const std::string & |
s |
|
) |
| |
Add the class with the given word string to the class encoding
void ClassDecoder::decodefile |
( |
const std::string & |
filename, |
|
|
std::ostream * |
out, |
|
|
unsigned int |
start = 0 , |
|
|
unsigned int |
end = 0 , |
|
|
bool |
quiet = false |
|
) |
| |
Create a plain-text corpus file from a class-encoded corpus file (*.colibri.dat)
- Parameters
-
filename | Filename of the input file, a class-encoded corpus file (*.colibri.dat) |
out | Output stream for the plain-text corpus data; units (e.g. sentences) are delimited with newlines |
start | Start decoding at the specified line (corresponds to sentences or whatever other unit the data employs) |
end | End decoding at the specified line (this line will be included) (corresponds to sentences or whatever other unit the data employs) |
quiet | Do not report decoding problems to stderr |
void ClassDecoder::decodefile_v1 |
( |
std::ifstream * |
in, |
|
|
std::ostream * |
out, |
|
|
unsigned int |
start = 0 , |
|
|
unsigned int |
end = 0 , |
|
|
bool |
quiet = false |
|
) |
| |
std::string ClassDecoder::decodefiletostring |
( |
const std::string & |
filename, |
|
|
unsigned int |
start = 0 , |
|
|
unsigned int |
end = 0 , |
|
|
bool |
quiet = true |
|
) |
| |
Decode a class-encoded corpus file (*.colibri.dat) and return the plain-text corpus data as a string
- Parameters
-
filename | Filename of the input file, a class-encoded corpus file (*.colibri.dat) |
start | Start decoding at the specified line (corresponds to sentences or whatever other unit the data employs) |
end | End decoding at the specified line (this line will be included) (corresponds to sentences or whatever other unit the data employs) |
quiet | Do not report decoding problems to stderr |
- Returns
- A string with the plain-text corpus data; units (e.g. sentences) are delimited with newlines
vector< string > ClassDecoder::decodeseq |
( |
const std::vector< int > & |
seq | ) |
|
unsigned int ClassDecoder::gethighestclass |
( |
| ) |
|
|
inline |
Return the highest class in the class encoding
bool ClassDecoder::hasclass |
( |
unsigned int |
key | ) |
const |
|
inline |
Test if the specified class exists in this class encoding
void ClassDecoder::load |
( |
const std::string & |
filename | ) |
|
Load a class encoding from file
unsigned int ClassDecoder::newclass |
( |
| ) |
|
Return a new class, not yet assigned
std::string ClassDecoder::operator[] |
( |
unsigned int |
key | ) |
const |
|
inline |
Return the word pertaining to the given class. Unknown classes will be decoded as {?}.
void ClassDecoder::prune |
( |
unsigned int |
threshold | ) |
|
Retain only the specified number of most frequent classes, prune the remainder
int ClassDecoder::size |
( |
| ) |
const |
|
inline |
Return the number of classes, i.e. word types, in the class encoding
const unsigned char ClassDecoder::delimiterclass = 0 |
|
static |
const unsigned char ClassDecoder::flexclass = 4 |
|
static |
const unsigned char ClassDecoder::skipclass = 3 |
|
static |
const unsigned char ClassDecoder::unknownclass = 2 |
|
static |
The documentation for this class was generated from the following files: