mirror of
https://github.com/davisking/dlib.git
synced 2024-11-01 10:14:53 +08:00
filled out spec
This commit is contained in:
parent
1493769dee
commit
07b5b2e37b
@ -12,11 +12,27 @@ namespace dlib
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
struct constituent
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object represents the linguistic idea of a constituent, that is, a
|
||||
group of words that functions as a single unit. In particular, it
|
||||
represents a combination of two constituents into a new constituent.
|
||||
|
||||
Additionally, a constituent object represents a range of words relative to
|
||||
some std::vector of words. The range is from [begin, end) (i.e. including
|
||||
begin but not including end, so using the normal C++ iterator notation).
|
||||
Moreover, a constituent is always composed of two parts, each having a tag.
|
||||
Therefore, the left part is composed of the words in the range [begin,k)
|
||||
and has tag left_tag while the right part of the constituent contains the
|
||||
words in the range [k,end) and has the tag right_tag.
|
||||
|
||||
The tags are user defined objects of type T. In general, they are used to
|
||||
represent syntactic categories such as noun phrase, verb phrase, etc.
|
||||
!*/
|
||||
|
||||
unsigned long begin, end, k;
|
||||
@ -24,7 +40,9 @@ namespace dlib
|
||||
T right_tag;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
void serialize(
|
||||
const constituent<T>& item,
|
||||
std::ostream& out
|
||||
@ -33,7 +51,9 @@ namespace dlib
|
||||
provides serialization support
|
||||
!*/
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
void deserialize(
|
||||
constituent<T>& item,
|
||||
std::istream& in
|
||||
@ -51,24 +71,53 @@ namespace dlib
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
struct parse_tree_element
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is used to represent a node in a binary parse tree. An entire
|
||||
parse tree is represented by a std::vector of parse_tree_element objects.
|
||||
We follow the convention that the first element of this vector is always
|
||||
the root of the entire tree.
|
||||
|
||||
The fields of this object have the following interpretations:
|
||||
- c == the constituent spanned by this node in the parse tree.
|
||||
Therefore, the node spans the words in the range [c.begin, c.end).
|
||||
- tag == the syntactic category of this node in the parse tree.
|
||||
- score == the score or log likelihood for this parse tree. In
|
||||
general, this is the sum of scores of all the production rules used
|
||||
to build the tree rooted at the current node.
|
||||
- let PT denote the vector of parse_tree_elements that defines an
|
||||
entire parse tree. Then we have:
|
||||
- if (left != END_OF_TREE) then
|
||||
- PT[left] == the left sub-tree of the current node.
|
||||
- PT[left] spans the words [c.begin, c.k)
|
||||
- PT[left].tag == c.left_tag
|
||||
- else
|
||||
- there is no left sub-tree
|
||||
|
||||
- if (right != END_OF_TREE) then
|
||||
- PT[right] == the right sub-tree of the current node.
|
||||
- PT[right] spans the words [c.k, c.end)
|
||||
- PT[right].tag == c.right_tag
|
||||
- else
|
||||
- there is no right sub-tree
|
||||
!*/
|
||||
|
||||
constituent<T> c;
|
||||
T tag; // id for the constituent corresponding to this level of the tree
|
||||
T tag;
|
||||
double score;
|
||||
|
||||
// subtrees. These are the index values into the std::vector that contains all the parse_tree_elements.
|
||||
unsigned long left;
|
||||
unsigned long right;
|
||||
|
||||
double score; // score for this tree
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
void serialize (
|
||||
const parse_tree_element<T>& item,
|
||||
std::ostream& out
|
||||
@ -77,7 +126,9 @@ namespace dlib
|
||||
provides serialization support
|
||||
!*/
|
||||
|
||||
template <typename T>
|
||||
template <
|
||||
typename T
|
||||
>
|
||||
void deserialize (
|
||||
parse_tree_element<T>& item,
|
||||
std::istream& in
|
||||
@ -90,20 +141,30 @@ namespace dlib
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
||||
void example_production_rule_function (
|
||||
const std::vector<T>& sequence,
|
||||
const std::vector<T>& words,
|
||||
const constituent<T>& c,
|
||||
std::vector<std::pair<T,double> >& possible_tags
|
||||
)
|
||||
/*!
|
||||
requires
|
||||
- 0 <= c.begin < c.k < c.end <= sequence.size()
|
||||
- 0 <= c.begin < c.k < c.end <= words.size()
|
||||
- possible_tags.size() == 0
|
||||
ensures
|
||||
- finds all the production rules that can turn c into a single non-terminal.
|
||||
Puts the IDs of these rules and their scores into possible_tags.
|
||||
- Finds all the syntactic categories that can be used to label c and puts those
|
||||
categories, along with their scores, into possible_tags. Or in other words,
|
||||
this function determines which production rules can be used to turn the left
|
||||
and right sub-constituents in c into a single constituent. The contents of c
|
||||
have the following interpretations:
|
||||
- The left sub-constituent has syntactic category c.left_tag
|
||||
- for all i such that c.begin <= i < c.k:
|
||||
- words[i] is part of the left sub-constituent.
|
||||
- The right sub-constituent has syntactic category c.right_tag
|
||||
- for all i such that c.k <= i < c.end:
|
||||
- words[i] is part of the right sub-constituent.
|
||||
|
||||
- Note that example_production_rule_function() is not a real function. It is
|
||||
here just to show you how to define production rule producing functions
|
||||
for use with the find_max_parse_cky() routine defined below.
|
||||
here just to show you how to define production rule producing functions for
|
||||
use with the find_max_parse_cky() routine defined below.
|
||||
!*/
|
||||
|
||||
template <
|
||||
@ -111,7 +172,7 @@ namespace dlib
|
||||
typename production_rule_function
|
||||
>
|
||||
void find_max_parse_cky (
|
||||
const std::vector<T>& sequence,
|
||||
const std::vector<T>& words,
|
||||
const production_rule_function& production_rules,
|
||||
std::vector<std::vector<parse_tree_element<T> > >& parse_trees
|
||||
);
|
||||
@ -119,6 +180,25 @@ namespace dlib
|
||||
requires
|
||||
- production_rule_function == a function or function object with the same
|
||||
interface as example_production_rule_function defined above.
|
||||
ensures
|
||||
- Uses the CKY algorithm to find the most probable/highest scoring parse tree
|
||||
of the given vector of words. The output is stored in #parse_trees.
|
||||
- This function outputs a set of non-overlapping parse trees. Each parse tree
|
||||
always spans the largest number of words possible, regardless of any other
|
||||
considerations (except that the parse trees cannot have overlapping word
|
||||
spans). For example, this function will never select a smaller parse tree,
|
||||
even if it would have a better score, if it can possibly build a larger tree.
|
||||
Therefore, this function will only output multiple parse trees if it is
|
||||
impossible to form words into a single parse tree.
|
||||
- This function uses production_rules() to find out what the allowed production
|
||||
rules are. That is, production_rules() defines all properties of the grammar
|
||||
used by find_max_parse_cky().
|
||||
- for all valid i:
|
||||
- #parse_trees[i].size() != 0
|
||||
- #parse_trees[i][0] == the root of the i'th parse tree.
|
||||
- #parse_trees[i][0].score == the score of the i'th parse tree.
|
||||
- The i'th parse tree spans all the elements of words in the range
|
||||
[#parse_trees[i][0].c.begin, #parse_trees[i][0].c.end).
|
||||
!*/
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
@ -128,31 +208,79 @@ namespace dlib
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown by parse_tree_to_string() and
|
||||
parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
|
||||
!*/
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
||||
template <typename T, typename U>
|
||||
template <
|
||||
typename T,
|
||||
typename U
|
||||
>
|
||||
std::string parse_tree_to_string (
|
||||
const std::vector<parse_tree_element<T> >& tree,
|
||||
const std::vector<U>& words
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- It must be possible to print U objects to an ostream using operator<<
|
||||
(typically, U would be something like std::string)
|
||||
ensures
|
||||
-
|
||||
- Interprets tree as a parse tree defined over the given sequence of words.
|
||||
- returns a bracketed string that represents the parse tree over the words.
|
||||
For example, suppose the following parse tree is input:
|
||||
|
||||
/\
|
||||
/ \
|
||||
/\ \
|
||||
/ \ \
|
||||
the dog ran
|
||||
|
||||
Then the output would be the string "[[the dog] ran]"
|
||||
throws
|
||||
- parse_tree_to_string_error
|
||||
This exception is thrown if an invalid tree is detected. This might happen
|
||||
if the tree refers to elements of words that don't exist because words is
|
||||
shorter than it is supposed to be.
|
||||
!*/
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
||||
template <typename T, typename U>
|
||||
template <
|
||||
typename T,
|
||||
typename U
|
||||
>
|
||||
std::string parse_tree_to_string_tagged (
|
||||
const std::vector<parse_tree_element<T> >& tree,
|
||||
const std::vector<U>& words
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- It must be possible to print T objects to an ostream using operator<<
|
||||
- It must be possible to print U objects to an ostream using operator<<
|
||||
(typically, U would be something like std::string)
|
||||
ensures
|
||||
-
|
||||
- This function does the same thing as parse_tree_to_string() except that it
|
||||
also includes the parse_tree_element::tag object in the output. Therefore,
|
||||
the tag of each bracket will be included as the first token inside the
|
||||
bracket. For example, suppose the following parse tree is input (where tags
|
||||
are shown at the vertices):
|
||||
|
||||
S
|
||||
/\
|
||||
NP \
|
||||
/\ \
|
||||
/ \ \
|
||||
the dog ran
|
||||
|
||||
Then the output would be the string "[S [NP the dog] ran]"
|
||||
throws
|
||||
- parse_tree_to_string_error
|
||||
This exception is thrown if an invalid tree is detected. This might happen
|
||||
if the tree refers to elements of words that don't exist because words is
|
||||
shorter than it is supposed to be.
|
||||
!*/
|
||||
|
||||
// -----------------------------------------------------------------------------------------
|
||||
|
Loading…
Reference in New Issue
Block a user