Colibri Core
|
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion. Allows numerous operations. More...
#include <pattern.h>
Public Member Functions | |
Pattern () | |
Pattern (const unsigned char *dataref, const int size) | |
Pattern (const Pattern &ref, unsigned int begin, unsigned int length) | |
Pattern (const PatternPointer &ref, unsigned int begin, unsigned int length) | |
Pattern (const Pattern &ref) | |
Pattern (const PatternPointer &ref) | |
Pattern (std::istream *in, bool ignoreeol=false, const unsigned char version=2, const unsigned char *corpusstart=NULL, bool debug=false) | |
~Pattern () | |
Pattern (int size) | |
void | write (std::ostream *out, const unsigned char *corpusstart=NULL) const |
const size_t | n () const |
const size_t | bytesize () const |
const size_t | size () const |
const unsigned int | skipcount () const |
const PatternCategory | category () const |
const bool | isskipgram () const |
const bool | isflexgram () const |
const bool | unknown () const |
Pattern | operator[] (int index) const |
const size_t | hash () const |
std::string | tostring (const ClassDecoder &classdecoder) const |
std::string | decode (const ClassDecoder &classdecoder) const |
bool | out () const |
std::vector< unsigned int > | tovector () const |
bool | operator== (const Pattern &other) const |
bool | operator!= (const Pattern &other) const |
bool | operator== (const PatternPointer &other) const |
bool | operator!= (const PatternPointer &other) const |
void | operator= (const Pattern &other) |
bool | operator< (const Pattern &other) const |
bool | operator> (const Pattern &other) const |
Pattern | operator+ (const Pattern &) const |
PatternPointer | getpointer () const |
int | find (const Pattern &subpattern) const |
bool | contains (const Pattern &subpattern) const |
bool | instanceof (const Pattern &skipgram) const |
int | ngrams (std::vector< Pattern > &container, const int n) const |
int | ngrams (std::vector< PatternPointer > &container, const int n) const |
int | subngrams (std::vector< Pattern > &container, int minn=1, int maxn=99) const |
int | subngrams (std::vector< PatternPointer > &container, int minn=1, int maxn=99) const |
int | ngrams (std::vector< std::pair< Pattern, int >> &container, const int n) const |
int | ngrams (std::vector< std::pair< PatternPointer, int >> &container, const int n) const |
int | subngrams (std::vector< std::pair< Pattern, int >> &container, int minn=1, int maxn=9) const |
int | subngrams (std::vector< std::pair< PatternPointer, int >> &container, int minn=1, int maxn=9) const |
int | parts (std::vector< Pattern > &container) const |
int | parts (std::vector< PatternPointer > &container) const |
int | parts (std::vector< std::pair< int, int > > &container) const |
int | gaps (std::vector< std::pair< int, int > > &container) const |
Pattern | extractskipcontent (const Pattern &instance) const |
Pattern | replace (int begin, int length, const Pattern &replacement) const |
Pattern | addskip (const std::pair< int, int > &gap) const |
Pattern | addskips (const std::vector< std::pair< int, int > > &gaps) const |
Pattern | addflexgaps (const std::vector< std::pair< int, int > > &gaps) const |
Pattern | reverse () const |
Pattern | toflexgram () const |
bool | isgap (int i) const |
Pattern | addcontext (const Pattern &leftcontext, const Pattern &rightcontext) const |
void | mask (std::vector< bool > &container) const |
void | set (const unsigned char *dataref, const int size) |
Public Attributes | |
unsigned char * | data |
Static Public Attributes | |
static const int | patterntype = PATTERN |
Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion. Allows numerous operations.
|
inline |
Default/empty Pattern constructor. Creates an empty pattern. Still consumes one byte (the end-marker)
Pattern::Pattern | ( | const unsigned char * | dataref, |
const int | size | ||
) |
Low-level pattern constructor from character array. The size is in bytes and never includes the end-marker.
dataref | Reference data, must be properly class-encoded |
size | The size (without \0 end marker!) to copy from dataref |
Pattern::Pattern | ( | const Pattern & | ref, |
unsigned int | begin, | ||
unsigned int | length | ||
) |
Slice constructor for Pattern
ref | Reference pattern |
begin | Index of the first token to copy (0-indexed) |
length | Number of tokens to copy |
Pattern::Pattern | ( | const PatternPointer & | ref, |
unsigned int | begin, | ||
unsigned int | length | ||
) |
Pattern::Pattern | ( | const Pattern & | ref | ) |
Copy constructor for Pattern
ref | Reference pattern |
Pattern::Pattern | ( | const PatternPointer & | ref | ) |
Pattern::Pattern | ( | std::istream * | in, |
bool | ignoreeol = false , |
||
const unsigned char | version = 2 , |
||
const unsigned char * | corpusstart = NULL , |
||
bool | debug = false |
||
) |
Read Pattern from input stream (in binary form)
in | The input stream |
ignoreeol | Ignore end of line markers and read on until the end of the file, storing corpus data in one pattern |
version | Version of file format (default: 2) |
corpusstart | not used |
Pattern::~Pattern | ( | ) |
|
inline |
Pattern constructor consisting of only a fixed-size gap
size | The size of the gap |
Pattern Pattern::addflexgaps | ( | const std::vector< std::pair< int, int > > & | gaps | ) | const |
Replaces multiple series of tokens with skips/gaps of undefined variable size. Effectively turns a pattern into a flexgram.
gaps | The positions and sizes of the gaps: a vector of pairs, each pair consisting of a begin index (0-indexed) and a length, indicating where to place the gap |
Pattern Pattern::addskip | ( | const std::pair< int, int > & | gap | ) | const |
Replaces a series of tokens with a skip/gap of a particular size. Effectively turns a pattern into a skipgram.
gap | The position and size of the skip/gap: a pair consisting of a begin index (0-indexed) and a length, i.e. the size of the skip |
Pattern Pattern::addskips | ( | const std::vector< std::pair< int, int > > & | gaps | ) | const |
Replaces multiple series of tokens with skips/gaps of particular sizes. Effectively turns a pattern into a skipgram.
gaps | The positions and sizes of the gaps: a vector of pairs, each pair consisting of a begin index (0-indexed) and a length, indicating where to place the gap |
const size_t Pattern::bytesize | ( | ) | const |
Returns the size of the pattern in bytes; this does not include the final \0 end-marker.
const PatternCategory Pattern::category | ( | ) | const |
Returns the category of this pattern (value from enum PatternCategory)
bool Pattern::contains | ( | const Pattern & | subpattern | ) | const |
Test whether the pattern contains the specified subpattern.
|
inline |
alias for tostring()
Given a skipgram and an ngram instantiation of it (i.e., both of the same length), extract a pattern from the instance that would fill the gaps. Raises an exception if the instance cannot be matched with the skipgram.
int Pattern::find | ( | const Pattern & | subpattern | ) | const |
Finds the specified subpattern in this pattern. Returns the index at which it is found, or -1 if it is not found at all.
int Pattern::gaps | ( | std::vector< std::pair< int, int > > & | container | ) | const |
Finds all the gaps of a skipgram or flexgram and adds them to container as (begin, length) pairs. Gaps are the counterpart of parts (parts are the portions that are not skips): thus 'to be {*} not {*} be' has three parts and two gaps. The gap length of a flexgram will always be its minimum length, i.e. one.
PatternPointer Pattern::getpointer | ( | ) | const |
const size_t Pattern::hash | ( | ) | const |
Compute a hash value for this pattern
bool Pattern::instanceof | ( | const Pattern & | skipgram | ) | const |
Tests whether the pattern is an instantiation of the specified skipgram
|
inline |
bool Pattern::isgap | ( | int | i | ) | const |
Is the word at the specified index (0 indexed) a gap?
|
inline |
void Pattern::mask | ( | std::vector< bool > & | container | ) | const |
const size_t Pattern::n | ( | ) | const |
Returns the size of the pattern in tokens (flex gaps are counted as size 1).
int Pattern::ngrams | ( | std::vector< Pattern > & | container, |
const int | n | ||
) | const |
Adds all patterns (not just ngrams) of size n that are contained within the pattern to container. Does not extract skipgrams that are not directly present in the pattern.
int Pattern::ngrams | ( | std::vector< PatternPointer > & | container, |
const int | n | ||
) | const |
int Pattern::ngrams | ( | std::vector< std::pair< Pattern, int >> & | container, |
const int | n | ||
) | const |
Adds all pairs of all patterns (not just ngrams) of size n that are contained within the pattern, with the token offset at which they were found, to container. Does not extract skipgrams that are not directly present in the pattern.
int Pattern::ngrams | ( | std::vector< std::pair< PatternPointer, int >> & | container, |
const int | n | ||
) | const |
bool Pattern::operator!= | ( | const Pattern & | other | ) | const |
bool Pattern::operator!= | ( | const PatternPointer & | other | ) | const |
bool Pattern::operator< | ( | const Pattern & | other | ) | const |
Patterns can be sorted. Note, however, that the sorting is based on the frequencies of the tokens and is not alphanumerical!
void Pattern::operator= | ( | const Pattern & | other | ) |
Assignment operator
bool Pattern::operator== | ( | const Pattern & | other | ) | const |
bool Pattern::operator== | ( | const PatternPointer & | other | ) | const |
bool Pattern::operator> | ( | const Pattern & | other | ) | const |
|
inline |
Returns a single token (not a byte!). The index must be less than size().
bool Pattern::out | ( | ) | const |
Debug function outputting the classes in this pattern to stderr
int Pattern::parts | ( | std::vector< Pattern > & | container | ) | const |
Finds all the parts of a skipgram (parts are the portions that are not skips) and adds them to container. Thus 'to be {*} not {*} be' has three parts.
int Pattern::parts | ( | std::vector< PatternPointer > & | container | ) | const |
int Pattern::parts | ( | std::vector< std::pair< int, int > > & | container | ) | const |
Finds all the parts of a skipgram (parts are the portions that are not skips) and adds them to container as (begin, length) pairs. Thus 'to be {*} not {*} be' has three parts.
Replace the tokens from begin (0-indexed), up to the specified length, with a replacement pattern (of any length)
Pattern Pattern::reverse | ( | ) | const |
Returns a pattern with the tokens in reverse order
void Pattern::set | ( | const unsigned char * | dataref, |
const int | size | ||
) |
|
inline |
Returns the size of the pattern in tokens (flex gaps are counted as size 1).
const unsigned int Pattern::skipcount | ( | ) | const |
return the number of skips in this pattern
int Pattern::subngrams | ( | std::vector< Pattern > & | container, |
int | minn = 1 , |
||
int | maxn = 99 |
||
) | const |
Adds all patterns (not just ngrams) of all sizes that are contained within the pattern to container. Does not extract skipgrams that are not directly present in the pattern. Also returns the full ngram itself by default. Set maxn and minn to constrain.
int Pattern::subngrams | ( | std::vector< PatternPointer > & | container, |
int | minn = 1 , |
||
int | maxn = 99 |
||
) | const |
int Pattern::subngrams | ( | std::vector< std::pair< Pattern, int >> & | container, |
int | minn = 1 , |
||
int | maxn = 9 |
||
) | const |
Adds all pairs of all patterns (not just ngrams) that are contained within the pattern, with the token offset at which they were found, to container. Does not extract skipgrams that are not directly present in the pattern.
int Pattern::subngrams | ( | std::vector< std::pair< PatternPointer, int >> & | container, |
int | minn = 1 , |
||
int | maxn = 9 |
||
) | const |
Pattern Pattern::toflexgram | ( | ) | const |
converts a skipgram into a flexgram (ngrams just come out unchanged)
std::string Pattern::tostring | ( | const ClassDecoder & | classdecoder | ) | const |
Converts this pattern back into its string representation, using a classdecoder
vector< unsigned int > Pattern::tovector | ( | ) | const |
Convert the pattern to a vector of integers, where the integers correspond to the token classes.
const bool Pattern::unknown | ( | ) | const |
void Pattern::write | ( | std::ostream * | out, |
const unsigned char * | corpusstart = NULL |
||
) | const |
Write Pattern to output stream (in binary form)
out | The output stream |
unsigned char* Pattern::data |
This array holds the variable-width byte representation; it is always terminated by \0 (ENDMARKER). Though public, you usually do not want to access it directly.
|
static |