Colibri Core
|
A model mapping patterns to values, high-level interface. More...
#include <patternmodel.h>
Public Types | |
typedef MapType::iterator | iterator |
typedef MapType::const_iterator | const_iterator |
Public Member Functions | |
PatternModel (IndexedCorpus *corpus=NULL) | |
PatternModel (std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
~PatternModel () | |
PatternModel (const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
virtual int | getmodeltype () const |
virtual int | getmodelversion () const |
virtual size_t | size () const |
virtual bool | has (const Pattern &pattern) const |
virtual bool | has (const PatternPointer &pattern) const |
virtual void | load (std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL) |
virtual void | load (std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL) |
PatternModelInterface * | getinterface () |
virtual void | train (std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) |
virtual void | train (const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) |
virtual int | computeskipgrams (const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false) |
virtual int | computeskipgrams (const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false) |
virtual std::vector< PatternPointer > | findskipgrams (const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3) |
virtual void | trainskipgrams (const PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL) |
void | test (MapType &target, std::istream *in) |
void | write (std::ostream *out) |
void | write (const std::string filename) |
virtual int | maxlength () const |
virtual int | minlength () const |
virtual unsigned int | occurrencecount (const Pattern &pattern) |
virtual unsigned int | occurrencecount (const PatternPointer &pattern) |
virtual ValueType * | getdata (const Pattern &pattern, bool makeifnew=false) |
virtual ValueType * | getdata (const PatternPointer &pattern, bool makeifnew=false) |
virtual unsigned int | types () |
virtual unsigned int | tokens () const |
unsigned char | type () const |
unsigned char | version () const |
void | output (std::ostream *) |
unsigned int | coveragecount (const Pattern &key) |
double | coverage (const Pattern &key) |
std::vector< PatternPointer > | getreverseindex (const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_bysentence (int sentence) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_right (const IndexReference ref) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_left (const IndexReference ref) |
void | computestats () |
virtual void | resetstats () |
virtual void | computecoveragestats (int category=0, int n=0) |
unsigned int | totaloccurrencesingroup (int category, int n) |
unsigned int | totalpatternsingroup (int category, int n) |
unsigned int | totalwordtypesingroup (int category, int n) |
unsigned int | totaltokensingroup (int category, int n) |
double | frequency (const Pattern &pattern) |
virtual void | add (const PatternPointer &patternpointer, const IndexReference &ref) |
virtual void | add (const Pattern &pattern, ValueType *value, const IndexReference &ref) |
virtual void | add (const PatternPointer &pattern, ValueType *value, const IndexReference &ref) |
unsigned int | prune (int threshold, int _n=0) |
virtual unsigned int | pruneskipgrams (unsigned int threshold, int minskiptypes=2, int _n=0) |
unsigned int | prunenotinset (const std::unordered_set< Pattern > &s, int _n) |
template<class ValueType2 , class ValueHandler2 , class MapType2 > | |
unsigned int | prunebymodel (PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel) |
std::vector< std::pair< Pattern, int > > | getpatterns (const Pattern &pattern) |
virtual void | print (std::ostream *out, ClassDecoder &decoder) |
virtual void | printreverseindex (std::ostream *out, ClassDecoder &decoder) |
void | printmodel (std::ostream *out, ClassDecoder &decoder) |
virtual void | print (std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true) |
void | printpattern (std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true) |
void | histogram (std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0) |
unsigned int | topthreshold (int amount, int category=0, int size=0) |
void | histogram (std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0) |
void | info (std::ostream *OUT) |
void | report (std::ostream *OUT) |
PatternSet< uint64_t > | extractset (int minlength=1, int maxlength=1) |
virtual void | outputrelations (const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT) |
virtual t_relationmap | getsubchildren (const Pattern &pattern, int=0, int=0, int=0) |
virtual t_relationmap | getsubparents (const Pattern &pattern, int=0, int=0, int=0) |
virtual t_relationmap | gettemplates (const Pattern &pattern, int=0) |
virtual t_relationmap | getinstances (const Pattern &pattern, int=0) |
virtual t_relationmap | getskipcontent (const PatternPointer &pattern) |
virtual t_relationmap | getleftneighbours (const Pattern &pattern, int=0, int=0, int=0, int=0) |
virtual t_relationmap | getrightneighbours (const Pattern &pattern, int=0, int=0, int=0, int=0) |
virtual t_relationmap_double | getnpmi (const Pattern &pattern, double threshold) |
virtual int | computeflexgrams_fromskipgrams () |
virtual int | computeflexgrams_fromcooc () |
virtual void | outputcooc_npmi (std::ostream *OUT, ClassDecoder &classdecoder, double threshold) |
virtual void | outputcooc (std::ostream *OUT, ClassDecoder &classdecoder, double threshold) |
Public Member Functions inherited from PatternModelInterface | |
virtual PatternStoreInterface * | getstoreinterface () |
virtual PatternStoreInterface * | getstoreinterface () |
Public Attributes | |
IndexedCorpus * | reverseindex |
Pointer to the reverse index and corpus data for this model (or NULL) More... | |
bool | reverseindex_internal |
bool | hasskipgrams |
Does this model have skipgrams? More... | |
Protected Member Functions | |
virtual void | postread (const PatternModelOptions options) |
virtual void | posttrain (const PatternModelOptions options) |
Protected Attributes | |
unsigned char | model_type |
unsigned char | model_version |
uint64_t | totaltokens |
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! More... | |
uint64_t | totaltypes |
Total number of unigram/word types in the original corpus, SO INCLUDING TYPES NOT COVERED BY THE MODEL! More... | |
int | maxn |
int | minn |
std::set< int > | cache_categories |
std::set< int > | cache_n |
std::map< int, std::map< int, unsigned int > > | cache_grouptotal |
total occurrences (used for frequency computation, within a group) More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotalpatterns |
total distinct patterns per group More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotalwordtypes |
total covered word types per group More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotaltokens |
total covered tokens per group More... | |
std::map< int, std::vector< uint32_t > > | gapmasks |
pre-computed masks representing possible gap configurations for various pattern lengths More... | |
A model mapping patterns to values, high-level interface.
ValueType | The type of Value this model stores |
ValueHandler | A handler class for this type of value |
MapType | The type of container to use |
typedef MapType::const_iterator PatternModel< ValueType, ValueHandler, MapType, PatternType >::const_iterator |
typedef MapType::iterator PatternModel< ValueType, ValueHandler, MapType, PatternType >::iterator |
|
inline |
Begin a new pattern model, optionally pre-setting a reverseindex.
|
inline |
Read a pattern model from an input stream
f | The input stream |
options | Options for reading, these act as filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
corpus | Pointer to the loaded corpus, used as a reverse index. |
|
inline |
|
inline |
Read a pattern model from file
filename | The input filename |
options | Options for reading, these act as filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
corpus | Pointer to the loaded corpus, used as a reverse index. |
|
inlinevirtual |
Add a pattern, with a given position, to the model. This is called during training every time an instance of a pattern is found in the data. This is the high-level version.
pattern | The pattern to add (a patternpointer) |
ref | The position in the corpus where the pattern occurs |
Reimplemented in IndexedPatternPointerModel< MapType >, and PatternPointerModel< ValueType, ValueHandler, MapType >.
|
inlinevirtual |
Add a pattern, with a given position, and a value to the model. This is called during training every time an instance of a pattern is found in the data. This is the low-level version.
pattern | The pattern to add |
value | A pointer to the value for this pattern, what kind of value depends on the ValueType template parameter. |
ref | The position in the corpus where the pattern occurs |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inlinevirtual |
Compute coverage statistics on the model, will generally be called automatically by methods that use it, and the statistics are cached after computation.
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inlinevirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Low-level function to compute skipgrams for a given pattern. See the higher-level function instead.
|
inlinevirtual |
Low-level function to compute skipgrams for a given pattern. See trainskipgrams() instead.
|
inline |
Compute statistics on the model, will generally be called automatically by methods that use it, and the statistics are cached after computation.
|
inline |
Return coverage as a fraction of the total number of tokens in the model. For unindexed models this is a maximal projection rather than an exact number.
|
inline |
Returns the coverage count for the given pattern. For unindexed models, the coverage count is a mere maximum projection equal to the product of the occurrence count and the size.
|
inline |
Returns a PatternSet containing patterns of the specified length. Patterns are actively reconstructed from patterns in the model, if necessary. So this includes patterns that are not in the model explicitly (i.e., smaller patterns that have been pruned).
|
inlinevirtual |
Returns a vector of all skipgrams that can be extracted from the given pattern
|
inlinevirtual |
Returns the frequency of a pattern within its own group (category and size). For instance, if you pass a bigram you will get the occurrence count as a fraction of the total occurrences of bigrams.
Implements PatternModelInterface.
|
inlinevirtual |
Get the value stored for the specified pattern.
makeifnew | Add the pattern with empty value if it does not exist (default: false) |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inline |
Returns a more generic but limited PatternModelInterface instance (polymorphism)
|
inlinevirtual |
|
inlinevirtual |
Returns the type of model (a value from ModelType)
Implements PatternModelInterface.
Reimplemented in IndexedPatternPointerModel< MapType >, PatternPointerModel< ValueType, ValueHandler, MapType >, IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Returns the version of the model implementation and binary serialisation format
Implements PatternModelInterface.
Reimplemented in IndexedPatternPointerModel< MapType >, PatternPointerModel< ValueType, ValueHandler, MapType >, IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inline |
Get all patterns in the given pattern that occur in the pattern model, as a vector of pairs of patterns and occurrence counts.
|
inline |
Given a position in the corpus, return a vector of all the patterns that cover this position.
ref | The position in the corpus |
occurrencecount | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inline |
Returns pairs of positions and patterns, consisting of all patterns found in the specified sentence (or whatever unit delimits your corpus)
sentence | The sentence index (starts at 1) |
|
inline |
Given a position in the corpus, return a vector of all the positions and patterns (as pairs) that occur to the left of this position
ref | The position in the corpus |
|
inline |
Given a position in the corpus, return a vector of all the positions and patterns (as pairs) that occur to the right of this position
ref | The position in the corpus |
|
inlinevirtual |
|
inlinevirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inlinevirtual |
|
inlinevirtual |
|
inlinevirtual |
Checks whether the given pattern occurs in the model
Implements PatternStoreInterface.
|
inlinevirtual |
Does the pattern occur in the pattern store?
Implements PatternStoreInterface.
|
inline |
Generate a histogram for the occurrence count of patterns
hist | This will contain the to-be-computed histogram |
threshold | Include only patterns at or above this occurrence threshold |
cap | Include only this many of the top frequencies (0=unconstrained) |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to filter or to 0 to cover all |
size | Set to any value above zero to only include patterns of the specified length. (0 for all sizes) |
|
inline |
Generate a histogram for the occurrence count of patterns and output it to the output stream.
OUT | the output stream |
threshold | Include only patterns at or above this occurrence threshold |
cap | Include only this many of the top frequencies (0=unconstrained) |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to filter or to 0 to cover all |
size | Set to any value above zero to only include patterns of the specified length. (0 for all sizes) |
|
inline |
Output information about the model to the output stream, includes some statistics and technical details such as space requirements.
|
inlinevirtual |
Read a pattern model from file
filename | The input filename |
options | Options for reading, these act as filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
|
inlinevirtual |
Read a pattern model from an input stream
f | The input stream |
options | Options for reading, these act as filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
|
inlinevirtual |
Returns the maximum length of patterns in this model
Implements PatternModelInterface.
|
inlinevirtual |
Returns the minimum length of patterns in this model
Implements PatternModelInterface.
|
inlinevirtual |
Returns the occurrence count of the specified pattern, will return 0 if it does not exist in the model
Implements PatternModelInterface.
|
inlinevirtual |
void PatternModel< ValueType, ValueHandler, MapType, PatternType >::output | ( | std::ostream * | ) |
|
inlinevirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
|
inlineprotectedvirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlineprotectedvirtual |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Print the contents of the pattern model, i.e. all patterns and associated counts, to the output stream.
out | The output stream |
decoder | The class decoder to use |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Print for one pattern only.
out | The output stream |
decoder | The class decoder to use |
Reimplemented in IndexedPatternModel< MapType, PatternPointer >.
|
inline |
Just an alias for print()
|
inline |
Alias for per-pattern print()
|
inlinevirtual |
Print the full reverse index, a mapping of indices and all patterns that occur at those positions.
out | The output stream |
decoder | The class decoder to use |
|
inline |
Prune all patterns under the specified occurrence threshold (or -1 for all). Pruning can be limited to patterns of a particular size only.
threshold | The occurrence threshold (set to -1 to prune everything) |
_n | The size constraint, limit to patterns of this size only (set to 0 for no constraint, default) |
|
inline |
Prune all patterns that are not in the second model
|
inline |
Prune all patterns that are not in the specified set.
s | The set containing the patterns not to prune |
_n | The size constraint, limit to patterns of this size only (set to 0 for no constraint, default) |
|
inlinevirtual |
Prune all skipgrams under the specified occurrence threshold (or -1 for all). Pruning can be limited to patterns of a particular size only.
threshold | The occurrence threshold (set to -1 to prune everything) |
_n | The size constraint, limit to patterns of this size only (set to 0 for no constraint, default) |
|
inline |
Output an elaborate statistical report to the output stream. Computes on first call when necessary.
|
inlinevirtual |
|
inlinevirtual |
Returns the number of distinct patterns in the model
Implements PatternStoreInterface.
void PatternModel< ValueType, ValueHandler, MapType, PatternType >::test | ( | MapType & | target, |
std::istream * | in | ||
) |
|
inlinevirtual |
Return the total amount of word/unigram tokens in the model
Implements PatternModelInterface.
|
inline |
|
inline |
Obtains statistics of the model: returns the total amount of occurrences within the specified group, the group consists of a category and a size.
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) or to 0 to cover all |
n | Set to any value above zero to only cover patterns of the specified length. (0 for all sizes) |
|
inline |
Obtains statistics of the model: returns the total amount of distinct patterns within the specified group, the group consists of a category and a size.
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) or to 0 to cover all |
n | Set to any value above zero to only cover patterns of the specified length. (0 for all sizes) |
|
inline |
Obtains statistics of the model: returns the total amount of covered tokens within the specified group, the group consists of a category and a size.
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) or to 0 to cover all |
n | Set to any value above zero to only cover patterns of the specified length. (0 for all sizes) |
|
inline |
Obtains statistics of the model: returns the total amount of word/unigram types within the specified group, the group consists of a category and a size.
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) or to 0 to cover all |
n | Set to any value above zero to only cover patterns of the specified length. (0 for all sizes) |
|
inlinevirtual |
Train a pattern model on corpus data (given an input stream)
in | The input stream of the corpus data (*.colibri.dat), may be NULL if a reverse index is loaded. |
options | Options for training |
constrainbymodel | Pointer to another pattern model which should be used to constrain the training of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
continued | Continued training on the same corpus data |
firstsentence | First sentence index, useful for augmenting a model with another corpus (keep continued set to false in this case), defaults to 1 |
ignoreerrors | Try to ignore errors (use for debug only) |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Train a pattern model on corpus data
filename | The filename of the corpus data (*.colibri.dat) |
options | Options for training |
constrainbymodel | Pointer to another pattern model which should be used to constrain the training of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inlinevirtual |
Train skipgrams, for indexed models only
Reimplemented in IndexedPatternModel< MapType, PatternType >, and IndexedPatternModel< MapType, PatternPointer >.
|
inline |
|
inlinevirtual |
Return the total amount of word/unigram types in the model
Implements PatternModelInterface.
|
inline |
|
inline |
Write the pattern model to output stream
|
inline |
Save the entire pattern model to file
|
protected |
|
protected |
total occurrences (used for frequency computation, within a group)
|
protected |
total distinct patterns per group
|
protected |
total covered tokens per group
|
protected |
total covered word types per group
|
protected |
|
protected |
pre-computed masks representing possible gap configurations for various pattern lengths
bool PatternModel< ValueType, ValueHandler, MapType, PatternType >::hasskipgrams |
Does this model have skipgrams?
|
protected |
|
protected |
|
protected |
|
protected |
IndexedCorpus* PatternModel< ValueType, ValueHandler, MapType, PatternType >::reverseindex |
Pointer to the reverse index and corpus data for this model (or NULL)
bool PatternModel< ValueType, ValueHandler, MapType, PatternType >::reverseindex_internal |
|
protected |
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL!
|
protected |
Total number of unigram/word types in the original corpus, SO INCLUDING TYPES NOT COVERED BY THE MODEL!