Colibri Core
|
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of PatternModel. More...
#include <patternmodel.h>
Public Member Functions | |
IndexedPatternModel (IndexedCorpus *corpus=NULL) | |
IndexedPatternModel (std::istream *f, const PatternModelOptions options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
IndexedPatternModel (const std::string filename, const PatternModelOptions options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
virtual | ~IndexedPatternModel () |
int | getmodeltype () const |
int | getmodelversion () const |
virtual void | add (const Pattern &pattern, IndexedData *value, const IndexReference &ref) |
virtual void | add (const PatternPointer &patternpointer, IndexedData *value, const IndexReference &ref) |
IndexedData * | getdata (const Pattern &pattern, bool makeifnew=false) |
IndexedData * | getdata (const PatternPointer &pattern, bool makeifnew=false) |
virtual void | train (std::istream *in, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) |
virtual void | train (const std::string &filename, PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL, bool continued=false, uint32_t firstsentence=1, bool ignoreerrors=false) |
void | info (std::ostream *OUT) |
void | print (std::ostream *out, ClassDecoder &decoder) |
void | print (std::ostream *out, ClassDecoder &decoder, const PatternPointer &pattern, bool endline=true) |
virtual void | trainskipgrams (PatternModelOptions options, PatternModelInterface *constrainbymodel=NULL) |
Pattern | getpatternfromtoken (IndexReference ref) |
t_relationmap | getskipcontent (const PatternPointer &pattern) |
void | prunerelations (t_relationmap &relations, unsigned int occurrencethreshold) |
t_relationmap | gettemplates (const Pattern &pattern, unsigned int occurrencethreshold=0) |
t_relationmap | getinstances (const Pattern &pattern, unsigned int occurrencethreshold=0) |
t_relationmap | getsubchildren (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0) |
t_relationmap | getsubparents (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0) |
t_relationmap | getleftneighbours (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0) |
t_relationmap | getrightneighbours (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, unsigned int cutoff=0) |
int | pruneskipgrams (int threshold, int minskiptypes, int _n=0) |
virtual void | computecoveragestats (int category=0, int n=0) |
t_relationmap | getrightcooc (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, IndexedData *matches=NULL) |
t_relationmap | getleftcooc (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0) |
t_relationmap | getcooc (const PatternPointer &pattern, unsigned int occurrencethreshold=0, int category=0, unsigned int size=0, bool ordersignificant=false) |
double | npmi (const PatternPointer &key1, const PatternPointer &key2, int jointcount) |
void | outputrelations (const PatternPointer &pattern, t_relationmap &relations, ClassDecoder &classdecoder, std::ostream *OUT, const std::string label="RELATED-TO") |
void | outputrelations (const PatternPointer &pattern, ClassDecoder &classdecoder, std::ostream *OUT, bool outputheader=true) |
void | computenpmi (std::map< PatternPointer, t_relationmap_double > &coocmap, double threshold, bool right=true, bool left=true) |
void | computecooc (std::map< PatternPointer, t_relationmap > &coocmap, int threshold, bool right=true, bool left=true) |
int | computeflexgrams_fromskipgrams () |
int | computeflexgrams_fromcooc (double threshold) |
void | outputcooc_npmi (std::ostream *OUT, ClassDecoder &classdecoder, double threshold) |
void | outputcooc (std::ostream *OUT, ClassDecoder &classdecoder, double threshold) |
int | flexgramsize (const Pattern &pattern, IndexReference begin) |
Public Member Functions inherited from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType > | |
PatternModel (IndexedCorpus *corpus=NULL) | |
PatternModel (std::istream *f, PatternModelOptions options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
PatternModel (const std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL, IndexedCorpus *corpus=NULL) | |
~PatternModel () | |
virtual size_t | size () const |
virtual bool | has (const Pattern &pattern) const |
virtual bool | has (const PatternPointer &pattern) const |
virtual void | load (std::string &filename, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL) |
virtual void | load (std::istream *f, const PatternModelOptions &options, PatternModelInterface *constrainmodel=NULL) |
PatternModelInterface * | getinterface () |
virtual int | computeskipgrams (const PatternPointer &pattern, int mintokens=2, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, std::vector< PatternPointer > *targetcontainer=NULL, const bool exhaustive=false, const int maxskips=3, const bool DEBUG=false) |
virtual int | computeskipgrams (const PatternPointer &pattern, PatternModelOptions &options, const IndexReference *singleref=NULL, const IndexedData *multiplerefs=NULL, PatternModelInterface *constrainbymodel=NULL, const bool exhaustive=false) |
virtual std::vector< PatternPointer > | findskipgrams (const PatternPointer &pattern, unsigned int occurrencethreshold=1, int maxskips=3) |
void | test (MapType &target, std::istream *in) |
void | write (std::ostream *out) |
void | write (const std::string filename) |
virtual int | maxlength () const |
virtual int | minlength () const |
virtual unsigned int | occurrencecount (const Pattern &pattern) |
virtual unsigned int | occurrencecount (const PatternPointer &pattern) |
virtual unsigned int | types () |
virtual unsigned int | tokens () const |
unsigned char | type () const |
unsigned char | version () const |
void | output (std::ostream *) |
unsigned int | coveragecount (const Pattern &key) |
double | coverage (const Pattern &key) |
std::vector< PatternPointer > | getreverseindex (const IndexReference ref, int occurrencecount=0, int category=0, unsigned int size=0) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_bysentence (int sentence) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_right (const IndexReference ref) |
std::vector< std::pair< IndexReference, PatternPointer > > | getreverseindex_left (const IndexReference ref) |
void | computestats () |
virtual void | resetstats () |
unsigned int | totaloccurrencesingroup (int category, int n) |
unsigned int | totalpatternsingroup (int category, int n) |
unsigned int | totalwordtypesingroup (int category, int n) |
unsigned int | totaltokensingroup (int category, int n) |
double | frequency (const Pattern &pattern) |
virtual void | add (const PatternPointer &patternpointer, const IndexReference &ref) |
unsigned int | prune (int threshold, int _n=0) |
virtual unsigned int | pruneskipgrams (unsigned int threshold, int minskiptypes=2, int _n=0) |
unsigned int | prunenotinset (const std::unordered_set< Pattern > &s, int _n) |
unsigned int | prunebymodel (PatternModel< ValueType2, ValueHandler2, MapType2 > &secondmodel) |
std::vector< std::pair< Pattern, int > > | getpatterns (const Pattern &pattern) |
virtual void | print (std::ostream *out, ClassDecoder &decoder, const PatternType &pattern, bool endline=true) |
virtual void | printreverseindex (std::ostream *out, ClassDecoder &decoder) |
void | printmodel (std::ostream *out, ClassDecoder &decoder) |
void | printpattern (std::ostream *out, ClassDecoder &decoder, const Pattern &pattern, bool endline=true) |
void | histogram (std::map< unsigned int, unsigned int > &hist, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0) |
void | histogram (std::ostream *OUT, unsigned int threshold=0, unsigned int cap=0, int category=0, unsigned int size=0) |
unsigned int | topthreshold (int amount, int category=0, int size=0) |
void | info (std::ostream *OUT) |
void | report (std::ostream *OUT) |
PatternSet< uint64_t > | extractset (int minlength=1, int maxlength=1) |
virtual void | outputrelations (const Pattern &pattern, ClassDecoder &classdecoder, std::ostream *OUT) |
virtual t_relationmap | getsubchildren (const Pattern &pattern, int=0, int=0, int=0) |
virtual t_relationmap | getsubparents (const Pattern &pattern, int=0, int=0, int=0) |
virtual t_relationmap | gettemplates (const Pattern &pattern, int=0) |
virtual t_relationmap | getinstances (const Pattern &pattern, int=0) |
virtual t_relationmap | getleftneighbours (const Pattern &pattern, int=0, int=0, int=0, int=0) |
virtual t_relationmap | getrightneighbours (const Pattern &pattern, int=0, int=0, int=0, int=0) |
virtual t_relationmap_double | getnpmi (const Pattern &pattern, double threshold) |
virtual int | computeflexgrams_fromcooc () |
Public Member Functions inherited from PatternModelInterface | |
virtual PatternStoreInterface * | getstoreinterface () |
virtual PatternStoreInterface * | getstoreinterface () |
Protected Member Functions | |
virtual void | postread (const PatternModelOptions options) |
virtual void | posttrain (const PatternModelOptions options) |
Additional Inherited Members | |
Public Types inherited from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType > | |
typedef MapType::iterator | iterator |
typedef MapType::const_iterator | const_iterator |
Public Attributes inherited from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType > | |
IndexedCorpus * | reverseindex |
Pointer to the reverse index and corpus data for this model (or NULL) More... | |
bool | reverseindex_internal |
bool | hasskipgrams |
Does this model have skipgrams? More... | |
Protected Attributes inherited from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType > | |
unsigned char | model_type |
unsigned char | model_version |
uint64_t | totaltokens |
Total number of tokens in the original corpus, so INCLUDES TOKENS NOT COVERED BY THE MODEL! More... | |
uint64_t | totaltypes |
Total number of unigram/word types in the original corpus, so INCLUDES TYPES NOT COVERED BY THE MODEL! More... | |
int | maxn |
int | minn |
std::set< int > | cache_categories |
std::set< int > | cache_n |
std::map< int, std::map< int, unsigned int > > | cache_grouptotal |
total occurrences (used for frequency computation, within a group) More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotalpatterns |
total distinct patterns per group More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotalwordtypes |
total covered word types per group More... | |
std::map< int, std::map< int, unsigned int > > | cache_grouptotaltokens |
total covered tokens per group More... | |
std::map< int, std::vector< uint32_t > > | gapmasks |
pre-computed masks representing possible gap configurations for various pattern lengths More... | |
An indexed model mapping patterns to values, high-level interface. This is a specialised subclass of PatternModel.
MapType | The type of container to use |
|
inline |
Begin a new pattern model, optionally pre-setting a reverseindex.
|
inline |
Read a pattern model from an input stream
f | The input stream |
options | Options for reading; these act as a filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
corpus | Pointer to the loaded corpus, used as a reverse index. |
|
inline |
Read a pattern model from file
filename | The filename |
options | Options for reading; these act as a filter for the data, allowing you to raise thresholds etc |
constrainmodel | Pointer to another pattern model which should be used to constrain the loading of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
corpus | Pointer to the loaded corpus, used as a reverse index. |
|
inlinevirtual |
|
inlinevirtual |
Add a pattern, with a given position, and a value to the model. This is called during training each time an instance of a pattern is found in the data.
pattern | The pattern to add |
value | A pointer to the value for this pattern, set to NULL and it will be automatically determined |
IndexReference | The position in the corpus where the pattern occurs |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
Reimplemented in IndexedPatternPointerModel< MapType >.
|
inline |
Compute co-occurrence as absolute joint occurrence count, for all patterns
coocmap | The map that will store the results |
threshold | Only include pairs whose joint occurrence count passes this threshold |
right | Compute co-occurrence to the right (default: true) |
left | Compute co-occurrence to the left (default: true) |
|
inlinevirtual |
Compute coverage statistics on the model, will generally be called automatically by methods that use it, and the statistics are cached after computation.
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
Compute flexgrams using co-occurrence
threshold | Normalised Pointwise Mutual Information threshold |
|
inlinevirtual |
Compute flexgrams by abstracting from existing skipgrams in the model
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
|
inline |
Attempt to find the flexgram size for the given begin position. Returns 0 if the flexgram was not found at all; if there are multiple matches, the shortest is returned.
|
inline |
Returns all patterns in the model that co-occur with the given pattern in the same sentence
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
ordersignificant | If set to true, each co-occurring pair will occur only once in the result; if false (default) it will appear twice, once in A,B form and once in B,A form. |
|
inlinevirtual |
Get the indices stored for the specified pattern.
makeifnew | Add the pattern with empty value if it does not exist (default: false) |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
Returns all ngrams in the model that instantiate the given skipgram/flexgram. If all the gaps in a skipgram/flexgram are filled, we speak of such an instantiation. An occurrence threshold may be used to filter.
|
inline |
Returns all patterns in the model that co-occur with the given pattern in the same sentence and appear to the left of the given pattern
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inline |
Returns all patterns in the model that directly neighbour the given pattern at the left side
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inlinevirtual |
Returns the type of model (a value from ModelType)
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Returns the version of the model implementation and binary serialisation format
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
Return the unigram Pattern that occurs on the specified position, using the reverse index.
|
inline |
Returns all patterns in the model that co-occur with the given pattern in the same sentence and appear to the right of the given pattern
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inline |
Returns all patterns in the model that directly neighbour the given pattern at the right side
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inlinevirtual |
Given a skipgram, returns patterns in the model which would instantiate the skipgram if inserted into the gaps. For skipgrams with multiple gaps, these skip content patterns are themselves skipgrams. Skipgram and skip content complement each other.
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
Returns all patterns in the model that are subsumed by the specified pattern. Subsumed patterns are smaller than the subsuming pattern. Every n-gram (except unigram) by definition subsumes two n-1-grams.
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inline |
Returns all patterns in the model that subsume the specified pattern. Subsuming patterns are larger than the subsumed pattern. Every n-gram (except unigram) by definition subsumes two n-1-grams.
occurrencethreshold | If set above zero, filters to only include patterns occurring above this threshold |
category | Set to any value of PatternCategory (NGRAM,SKIPGRAM,FLEXGRAM) to include only this category. Set to 0 for unfiltered (default) |
size | Set to any value above zero to only include patterns of the specified length. |
|
inline |
Returns all skipgrams and flexgrams in the model that are an abstraction of the specified pattern. The pattern itself may be a skipgram too. An optional occurrence threshold may be used to filter.
|
inline |
Output information about the model to the output stream, includes some statistics and technical details such as space requirements.
|
inline |
Compute normalised pointwise mutual information given two patterns and their joint occurrence count.
|
inlinevirtual |
Compute and output co-occurrence relations as joint occurrence count
threshold | Normalised Pointwise Mutual Information threshold |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Compute and output co-occurrence relations as Normalised Pointwise Mutual Information
threshold | Normalised Pointwise Mutual Information threshold |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
Output the specified relation map for the specified pattern to output stream. Low-level function.
pattern | The pattern |
relations | A relation map |
classdecoder | A class decoder |
OUT | The output stream |
label | A label to insert between relations (defaults to: RELATED-TO) |
|
inline |
Compute and output all possible relations for a given pattern. High-level function.
pattern | The pattern |
classdecoder | A class decoder |
OUT | The output stream |
outputheader | Output a header (default: true) |
|
inlineprotectedvirtual |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlineprotectedvirtual |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Print the contents of the pattern model, i.e. all patterns and associated counts, to the output stream.
out | The output stream |
decoder | The class decoder to use |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inline |
|
inline |
Given a relation map, prune relations below the specified occurrence threshold
relations | The relationmap to manipulate |
occurrencethreshold | The occurrence threshold |
|
inline |
Prune skipgrams based on an occurrence threshold, and a skiptype threshold. The latter enforces that at least the specified number of distinct patterns must fit in the gap for the skipgram to be retained.
_n | Set to any value above zero to only include patterns of the specified length. |
|
inlinevirtual |
Train a pattern model on corpus data (given an input stream)
in | The input stream of the corpus data (*.colibri.dat), may be NULL if a reverse index is loaded. |
options | Options for training |
constrainbymodel | Pointer to another pattern model which should be used to constrain the training of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
continued | Continued training on the same corpus data |
firstsentence | First sentence index, useful for augmenting a model with another corpus (keep continued set to false in this case), defaults to 1 |
ignoreerrors | Try to ignore errors (use for debug only) |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Train a pattern model on corpus data
filename | The filename of the corpus data (*.colibri.dat) |
options | Options for training |
constrainbymodel | Pointer to another pattern model which should be used to constrain the training of this one, only patterns also occurring in the other model will be included. Defaults to NULL (no constraining) |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.
|
inlinevirtual |
Train skipgrams, for indexed models only
options | Pattern model options |
constrainbymodel | Pointer to a pattern model to use as constraint: only include skipgrams that occur in the constraint model (default: NULL) |
Reimplemented from PatternModel< IndexedData, IndexedDataHandler, MapType, PatternType >.