Colibri Core
Public Member Functions | Public Attributes | Static Public Attributes | List of all members
Pattern Class Reference

Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion. Allows numerous operations. More...

#include <pattern.h>

Public Member Functions

 Pattern ()
 
 Pattern (const unsigned char *dataref, const int size)
 
 Pattern (const Pattern &ref, unsigned int begin, unsigned int length)
 
 Pattern (const PatternPointer &ref, unsigned int begin, unsigned int length)
 
 Pattern (const Pattern &ref)
 
 Pattern (const PatternPointer &ref)
 
 Pattern (std::istream *in, bool ignoreeol=false, const unsigned char version=2, const unsigned char *corpusstart=NULL, bool debug=false)
 
 ~Pattern ()
 
 Pattern (int size)
 
void write (std::ostream *out, const unsigned char *corpusstart=NULL) const
 
const size_t n () const
 
const size_t bytesize () const
 
const size_t size () const
 
const unsigned int skipcount () const
 
const PatternCategory category () const
 
const bool isskipgram () const
 
const bool isflexgram () const
 
const bool unknown () const
 
Pattern operator[] (int index) const
 
const size_t hash () const
 
std::string tostring (const ClassDecoder &classdecoder) const
 
std::string decode (const ClassDecoder &classdecoder) const
 
bool out () const
 
std::vector< unsigned int > tovector () const
 
bool operator== (const Pattern &other) const
 
bool operator!= (const Pattern &other) const
 
bool operator== (const PatternPointer &other) const
 
bool operator!= (const PatternPointer &other) const
 
void operator= (const Pattern &other)
 
bool operator< (const Pattern &other) const
 
bool operator> (const Pattern &other) const
 
Pattern operator+ (const Pattern &) const
 
PatternPointer getpointer () const
 
int find (const Pattern &subpattern) const
 
bool contains (const Pattern &subpattern) const
 
bool instanceof (const Pattern &skipgram) const
 
int ngrams (std::vector< Pattern > &container, const int n) const
 
int ngrams (std::vector< PatternPointer > &container, const int n) const
 
int subngrams (std::vector< Pattern > &container, int minn=1, int maxn=99) const
 
int subngrams (std::vector< PatternPointer > &container, int minn=1, int maxn=99) const
 
int ngrams (std::vector< std::pair< Pattern, int >> &container, const int n) const
 
int ngrams (std::vector< std::pair< PatternPointer, int >> &container, const int n) const
 
int subngrams (std::vector< std::pair< Pattern, int >> &container, int minn=1, int maxn=9) const
 
int subngrams (std::vector< std::pair< PatternPointer, int >> &container, int minn=1, int maxn=9) const
 
int parts (std::vector< Pattern > &container) const
 
int parts (std::vector< PatternPointer > &container) const
 
int parts (std::vector< std::pair< int, int > > &container) const
 
int gaps (std::vector< std::pair< int, int > > &container) const
 
Pattern extractskipcontent (const Pattern &instance) const
 
Pattern replace (int begin, int length, const Pattern &replacement) const
 
Pattern addskip (const std::pair< int, int > &gap) const
 
Pattern addskips (const std::vector< std::pair< int, int > > &gaps) const
 
Pattern addflexgaps (const std::vector< std::pair< int, int > > &gaps) const
 
Pattern reverse () const
 
Pattern toflexgram () const
 
bool isgap (int i) const
 
Pattern addcontext (const Pattern &leftcontext, const Pattern &rightcontext) const
 
void mask (std::vector< bool > &container) const
 
void set (const unsigned char *dataref, const int size)
 

Public Attributes

unsigned char * data
 

Static Public Attributes

static const int patterntype = PATTERN
 

Detailed Description

Pattern class, represents a pattern (ngram, skipgram or flexgram). Encoded in a memory-saving fashion. Allows numerous operations.

Constructor & Destructor Documentation

Pattern::Pattern ( )
inline

Default/empty Pattern constructor. Creates an empty pattern, which still consumes one byte (the end-marker).

Pattern::Pattern ( const unsigned char *  dataref,
const int  size 
)

Low-level pattern constructor from character array. The size is in bytes and never includes the end-marker.

Parameters
dataref: Reference data, must be properly class-encoded
size: The size (without \0 end marker!) to copy from dataref
Pattern::Pattern ( const Pattern &  ref,
unsigned int  begin,
unsigned int  length 
)

Slice constructor for Pattern

Parameters
ref: Reference pattern
begin: Index of the first token to copy (0-indexed)
length: Number of tokens to copy
Pattern::Pattern ( const PatternPointer &  ref,
unsigned int  begin,
unsigned int  length 
)
Pattern::Pattern ( const Pattern &  ref)

Copy constructor for Pattern

Parameters
ref: Reference pattern
Pattern::Pattern ( const PatternPointer &  ref)
Pattern::Pattern ( std::istream *  in,
bool  ignoreeol = false,
const unsigned char  version = 2,
const unsigned char *  corpusstart = NULL,
bool  debug = false 
)

Read Pattern from input stream (in binary form)

Parameters
in: The input stream
ignoreeol: Ignore end of line markers and read on until the end of the file, storing corpus data in one pattern
version: Version of file format (default: 2)
corpusstart: not used
Pattern::~Pattern ( )
Pattern::Pattern ( int  size)
inline

Pattern constructor consisting of only a fixed-size gap

Parameters
size: The size of the gap

Member Function Documentation

Pattern Pattern::addcontext ( const Pattern &  leftcontext,
const Pattern &  rightcontext 
) const
Pattern Pattern::addflexgaps ( const std::vector< std::pair< int, int > > &  gaps) const

Replaces multiple series of tokens with skips/gaps of undefined variable size. Effectively turns a pattern into a flexgram.

Parameters
gaps: The positions and sizes of the gaps: a vector of pairs, each pair consisting of a begin index (0-indexed) and a length, indicating where to place the gap
Returns
A flexgram
Pattern Pattern::addskip ( const std::pair< int, int > &  gap) const

Replaces a series of tokens with a skip/gap of a particular size. Effectively turns a pattern into a skipgram.

Parameters
gap: The position and size of the skip/gap: a pair consisting of a begin index (0-indexed) and a length, i.e. the size of the skip
Pattern Pattern::addskips ( const std::vector< std::pair< int, int > > &  gaps) const

Replaces multiple series of tokens with skips/gaps of particular sizes. Effectively turns a pattern into a skipgram.

Parameters
gaps: The positions and sizes of the gaps: a vector of pairs, each pair consisting of a begin index (0-indexed) and a length, indicating where to place the gap
Returns
A skipgram
const size_t Pattern::bytesize ( ) const

Returns the size of the pattern (in bytes); this does not include the final \0 end-marker.

const PatternCategory Pattern::category ( ) const

Returns the category of this pattern (value from enum PatternCategory)

bool Pattern::contains ( const Pattern &  subpattern) const

Test whether the pattern contains the specified subpattern.

std::string Pattern::decode ( const ClassDecoder &  classdecoder) const
inline

alias for tostring()

Pattern Pattern::extractskipcontent ( const Pattern &  instance) const

Given a skipgram and an ngram instantiation of it (i.e., both of the same length), extract a pattern from the instance that would fill the gaps. Raises an exception if the instance cannot be matched with the skipgram.

int Pattern::find ( const Pattern &  subpattern) const

Finds the specified subpattern in this pattern. Returns the index at which it is found, or -1 if it is not found at all.

int Pattern::gaps ( std::vector< std::pair< int, int > > &  container) const

Finds all the gaps of a skipgram or flexgram (gaps are the portions that are skips) and adds them to container as begin,length pairs. Thus 'to be {*} not {*} be' has two gaps. The gap length of a flexgram will always be reported as its minimum length, one.

PatternPointer Pattern::getpointer ( ) const
const size_t Pattern::hash ( ) const

Compute a hash value for this pattern

bool Pattern::instanceof ( const Pattern &  skipgram) const

Tests whether the pattern is an instantiation of the specified skipgram

const bool Pattern::isflexgram ( ) const
inline
bool Pattern::isgap ( int  i) const

Is the word at the specified index (0 indexed) a gap?

const bool Pattern::isskipgram ( ) const
inline
void Pattern::mask ( std::vector< bool > &  container) const
const size_t Pattern::n ( ) const

Returns the size of the pattern in tokens (will count flex gaps as size 1).

int Pattern::ngrams ( std::vector< Pattern > &  container,
const int  n 
) const

Adds all patterns (not just ngrams) of size n that are contained within the pattern to container. Does not extract skipgrams that are not directly present in the pattern.

int Pattern::ngrams ( std::vector< PatternPointer > &  container,
const int  n 
) const
int Pattern::ngrams ( std::vector< std::pair< Pattern, int >> &  container,
const int  n 
) const

Adds all pairs of all patterns (not just ngrams) of size n that are contained within the pattern, with the token offset at which they were found, to container. Does not extract skipgrams that are not directly present in the pattern.

int Pattern::ngrams ( std::vector< std::pair< PatternPointer, int >> &  container,
const int  n 
) const
bool Pattern::operator!= ( const Pattern &  other) const
bool Pattern::operator!= ( const PatternPointer &  other) const
Pattern Pattern::operator+ ( const Pattern &  other) const
bool Pattern::operator< ( const Pattern &  other) const

Patterns can be sorted; note, however, that the sorting is based on the frequencies of the tokens and is not alphanumerical!

void Pattern::operator= ( const Pattern &  other)

Assignment operator

bool Pattern::operator== ( const Pattern &  other) const
bool Pattern::operator== ( const PatternPointer &  other) const
bool Pattern::operator> ( const Pattern &  other) const
Pattern Pattern::operator[] ( int  index) const
inline

Returns a single token (not a byte!). Requires index < size().

bool Pattern::out ( ) const

Debug function outputting the classes in this pattern to stderr

int Pattern::parts ( std::vector< Pattern > &  container) const

Finds all the parts of a skipgram (parts are the portions that are not skips) and adds them to container. Thus 'to be {*} not {*} be' has three parts.

int Pattern::parts ( std::vector< PatternPointer > &  container) const
int Pattern::parts ( std::vector< std::pair< int, int > > &  container) const

Finds all the parts of a skipgram (parts are the portions that are not skips) and adds them to container as begin,length pairs. Thus 'to be {*} not {*} be' has three parts.

Pattern Pattern::replace ( int  begin,
int  length,
const Pattern &  replacement 
) const

Replace the tokens from begin (0-indexed), up to the specified length, with a replacement pattern (of any length)

Pattern Pattern::reverse ( ) const

Returns a pattern with the tokens in reverse order

void Pattern::set ( const unsigned char *  dataref,
const int  size 
)
const size_t Pattern::size ( ) const
inline

Returns the size of the pattern in tokens (will count flex gaps as size 1).

const unsigned int Pattern::skipcount ( ) const

return the number of skips in this pattern

int Pattern::subngrams ( std::vector< Pattern > &  container,
int  minn = 1,
int  maxn = 99 
) const

Adds all patterns (not just ngrams) of all sizes that are contained within the pattern to container. Does not extract skipgrams that are not directly present in the pattern. Also returns the full ngram itself by default. Set maxn and minn to constrain.

int Pattern::subngrams ( std::vector< PatternPointer > &  container,
int  minn = 1,
int  maxn = 99 
) const
int Pattern::subngrams ( std::vector< std::pair< Pattern, int >> &  container,
int  minn = 1,
int  maxn = 9 
) const

Adds all pairs of all patterns (not just ngrams) that are contained within the pattern, with the token offset at which they were found, to container. Does not extract skipgrams that are not directly present in the pattern.

int Pattern::subngrams ( std::vector< std::pair< PatternPointer, int >> &  container,
int  minn = 1,
int  maxn = 9 
) const
Pattern Pattern::toflexgram ( ) const

converts a skipgram into a flexgram (ngrams just come out unchanged)

std::string Pattern::tostring ( const ClassDecoder &  classdecoder) const

Converts this pattern back into its string representation, using a classdecoder

vector< unsigned int > Pattern::tovector ( ) const

Convert the pattern to a vector of integers, where the integers correspond to the token classes.

const bool Pattern::unknown ( ) const
void Pattern::write ( std::ostream *  out,
const unsigned char *  corpusstart = NULL 
) const

Write Pattern to output stream (in binary form)

Parameters
out: The output stream

Member Data Documentation

unsigned char* Pattern::data

This array holds the variable-width byte representation; it is always terminated by \0 (ENDMARKER). Though public, you usually do not want to access it directly.

const int Pattern::patterntype = PATTERN
static

The documentation for this class was generated from the following files: