Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexReference. The reverse index stores positions and unigrams.
More...
#include <patternstore.h>
|
| IndexedCorpus () |
|
| IndexedCorpus (unsigned char *corpus, unsigned int corpussize) |
|
| IndexedCorpus (std::istream *in, bool debug=false) |
|
| IndexedCorpus (std::string filename, bool debug=false) |
|
| ~IndexedCorpus () |
|
void | load (std::istream *in, bool debug=false) |
|
void | load (std::string filename, bool debug=false) |
|
unsigned char * | getpointer (const IndexReference &begin) const |
|
PatternPointer | getpattern (const IndexReference &begin, int length=1) const |
|
PatternPointer | getpattern () const |
|
unsigned char * | beginpointer () const |
|
unsigned int | bytesize () const |
|
PatternPointer | getsentence (int sentence) const |
|
PatternPointer | getsentence (unsigned char *sentencedata) const |
|
std::vector< IndexReference > | findpattern (const Pattern &pattern, uint32_t sentence=0, int maxmatches=0) |
|
void | findpattern (std::vector< IndexReference > &result, const Pattern &pattern, uint32_t sentence, const PatternPointer &sentencedata, int maxmatches=0) |
|
int | sentencelength (int sentence) const |
|
int | sentencelength (unsigned char *sentencebegin) const |
|
unsigned int | sentences () const |
|
iterator | begin () |
|
iterator | end () |
|
iterator | find (const IndexReference &ref) |
|
bool | has (const IndexReference &ref) const |
|
size_t | size () |
|
bool | empty () const |
|
unsigned int | operator[] (const IndexReference &ref) |
|
Class for reading an entire (class encoded) corpus into memory. It provides a reverse index by IndexReference. The reverse index stores positions and unigrams.
IndexedCorpus::IndexedCorpus |
( |
| ) |
|
|
inline |
IndexedCorpus::IndexedCorpus |
( |
unsigned char * |
corpus, |
|
|
unsigned int |
corpussize |
|
) |
| |
|
inline |
IndexedCorpus::IndexedCorpus |
( |
std::istream * |
in, |
|
|
bool |
debug = false |
|
) |
| |
IndexedCorpus::IndexedCorpus |
( |
std::string |
filename, |
|
|
bool |
debug = false |
|
) |
| |
IndexedCorpus::~IndexedCorpus |
( |
| ) |
|
|
inline |
unsigned char* IndexedCorpus::beginpointer |
( |
| ) |
const |
|
inline |
unsigned int IndexedCorpus::bytesize |
( |
| ) |
const |
|
inline |
bool IndexedCorpus::empty |
( |
| ) |
const |
|
inline |
Returns an iterator starting at the given position. Corresponds to end() when no such position is found.
std::vector< IndexReference > IndexedCorpus::findpattern |
( |
const Pattern & |
pattern, |
|
|
uint32_t |
sentence = 0 , |
|
|
int |
maxmatches = 0 |
|
) |
| |
Returns all positions at which the pattern occurs, up to a maximum number of matches if desired. Note that this iterates over the entire corpus and is far less efficient than a proper pattern model.
- Parameters
-
sentence | Restrict to a particular sentence (0=all sentences, default) |
void IndexedCorpus::findpattern |
( |
std::vector< IndexReference > & |
result, |
|
|
const Pattern & |
pattern, |
|
|
uint32_t |
sentence, |
|
|
const PatternPointer & |
sentencedata, |
|
|
int |
maxmatches = 0 |
|
) |
| |
Returns a pattern starting at the provided position and of the specified length.
unsigned char * IndexedCorpus::getpointer |
( |
const IndexReference & |
begin | ) |
const |
Low-level function; returns a data pointer given an IndexReference. Returns NULL when the index does not exist. Use getpattern() instead.
Get the sentence (or whatever other unit your data employs) specified by the given index. Sentences start at 1.
PatternPointer IndexedCorpus::getsentence |
( |
unsigned char * |
sentencedata | ) |
const |
Returns a const iterator starting at the given position. Corresponds to end() when no such position is found. Does the provided position occur in the corpus?
void IndexedCorpus::load |
( |
std::istream * |
in, |
|
|
bool |
debug = false |
|
) |
| |
void IndexedCorpus::load |
( |
std::string |
filename, |
|
|
bool |
debug = false |
|
) |
| |
Returns the token at the provided position. The token is returned as an integer corresponding to the class in a particular class encoding. Use getpattern() if you want a Pattern instance.
- See also
- getpattern
int IndexedCorpus::sentencelength |
( |
int |
sentence | ) |
const |
Returns the length of the sentence (or whatever other unit your data employs) at the given sentence index (indices start at 1).
int IndexedCorpus::sentencelength |
( |
unsigned char * |
sentencebegin | ) |
const |
unsigned int IndexedCorpus::sentences |
( |
| ) |
const |
|
inline |
Returns the total number of sentences (or whatever other unit delimits your data) in the corpus.
size_t IndexedCorpus::size |
( |
| ) |
|
|
inline |
Returns the number of tokens in the corpus.
unsigned char* IndexedCorpus::corpus |
|
protected |
unsigned int IndexedCorpus::corpussize |
|
protected |
std::map<uint32_t,unsigned char*> IndexedCorpus::sentenceindex |
|
protected |
unsigned int IndexedCorpus::totaltokens |
|
protected |
The documentation for this class was generated from the following files: