Class for encoding plain-text to binary class-encoded data.
More...
#include <unordered_map>
#include <string>
#include <vector>
#include <fstream>
#include <pattern.h>
#include <common.h>
Go to the source code of this file.
|
class | ClassEncoder |
| Class for encoding plain-text to binary class-encoded data. The ClassEncoder maintains a mapping of words to classes (integers). It allows a corpus to be losslessly compressed by replacing each word with its class. The classes are distributed based on word frequency, with frequent words receiving a lower class number that can be represented in fewer bytes, and rare words receiving a higher class number. More...
|
|
|
unsigned int | inttobytes (unsigned char *buffer, unsigned int cls) |
|
unsigned char * | inttobytes_v1 (unsigned int, int &length) |
|
int | readline (std::istream *IN, unsigned char *buffer, const int) |
|
unsigned char * | convert_v1_v2 (const unsigned char *olddata, unsigned int &newlength) |
|
unsigned char * | convert_v1_v2 (std::istream *in, bool ignoreeol, bool debug) |
|
const int | countwords (const unsigned char *data, const int l) |
|
Class for encoding plain-text to binary class-encoded data.
- Author
- Maarten van Gompel (proycon) proycon@anaproy.nl
LICENSE
Licensed under GPLv3
DESCRIPTION
Class for encoding plain-text to binary class-encoded data
unsigned char* convert_v1_v2 (const unsigned char *olddata, unsigned int &newlength)
unsigned char* convert_v1_v2 (std::istream *in, bool ignoreeol, bool debug)
const int countwords (const unsigned char *data, const int l)
unsigned int inttobytes (unsigned char *buffer, unsigned int cls)
unsigned char* inttobytes_v1 (unsigned int, int &length)
int readline (std::istream *IN, unsigned char *buffer, const int)