lexicon_collection
pymusas.lexicon_collection
LexiconType
@unique
class LexiconType(str, Enum)
Descriptions of the type associated to single and Multi Word Expression (MWE)
lexicon entries and templates. Any type with the word NON_SPECIAL
means
that it does not use any special syntax, for example does not use wildcards
or curly braces.
The value
attribute of each instance attribute is of type str
describing
the type associated with that attribute. For the best explanation see the
example below.
Instance Attributes
- SINGLE_NON_SPECIAL :
LexiconType
Single word lexicon lookup. - MWE_NON_SPECIAL :
LexiconType
MWE lexicon lookup. - MWE_WILDCARD :
LexiconType
MWE lexicon lookup using a wildcard. - MWE_CURLY_BRACES :
LexiconType
MWE lexicon lookup using curly braces.
ExamplesΒΆβ
from pymusas.lexicon_collection import LexiconType
assert 'Single Non Special' == LexiconType.SINGLE_NON_SPECIAL
assert 'Single Non Special' == LexiconType.SINGLE_NON_SPECIAL.value
assert 'SINGLE_NON_SPECIAL' == LexiconType.SINGLE_NON_SPECIAL.name
all_possible_values = {'Single Non Special', 'MWE Non Special',
'MWE Wildcard', 'MWE Curly Braces'}
assert all_possible_values == {lexicon_type.value for lexicon_type in LexiconType}
SINGLE_NON_SPECIALβ
class LexiconType(str, Enum):
| ...
| SINGLE_NON_SPECIAL = 'Single Non Special'
MWE_NON_SPECIALβ
class LexiconType(str, Enum):
| ...
| MWE_NON_SPECIAL = 'MWE Non Special'
MWE_WILDCARDβ
class LexiconType(str, Enum):
| ...
| MWE_WILDCARD = 'MWE Wildcard'
MWE_CURLY_BRACESβ
class LexiconType(str, Enum):
| ...
| MWE_CURLY_BRACES = 'MWE Curly Braces'
__repr__β
class LexiconType(str, Enum):
| ...
| def __repr__() -> str
Machine readable string. When printed and run eval()
over the string
you should be able to recreate the object.
LexiconEntryβ
@dataclass(init=True, repr=True, eq=True, order=False,
unsafe_hash=False, frozen=True)
class LexiconEntry
A LexiconEntry contains the semantic_tags
that are associated with a
lemma
and optionally the lemma's POS
.
As frozen is true, the attributes cannot be assigned another value.
This data type is mainly used for single word lexicons, rather than Multi Word Expression (MWE).
Note the parameters to the __init__
are the same as the Instance
Attributes.
Instance AttributesΒΆβ
- lemma :
str
The lemma of a token or the token itself. - semantic_tags :
List[str]
The semantic tags associated with the lemma
and optional POS
. The semantic tags are in rank order, the most likely tag is the first tag in the list. - pos :
str
, optional (default =None
)
The Part Of Speech (POS) to be associated with the lemma
.
lemmaβ
class LexiconEntry:
| ...
| lemma: str = None
semantic_tagsβ
class LexiconEntry:
| ...
| semantic_tags: List[str] = None
posβ
class LexiconEntry:
| ...
| pos: Optional[str] = None
LexiconMetaDataβ
@dataclass(init=True, repr=True, eq=True, order=False,
unsafe_hash=False, frozen=True)
class LexiconMetaData
A LexiconMetaData object contains all of the meta data about a given single word or Multi Word Expression (MWE) lexicon entry. This meta data can be used to help rank single and MWE entries when tagging.
As frozen is true, the attributes cannot be assigned another value.
Note the parameters to the __init__
are the same as the Instance
Attributes.
Instance AttributesΒΆβ
- semantic_tags :
List[str]
The semantic tags associated with the lexicon entry. The semantic tags are in rank order, the most likely tag is the first tag in the list. - n_gram_length :
int
The n-gram size of the lexicon entry, e.g. *_noun boot*_noun
will be of length 2 and all single word lexicon entries will be of length 1. - lexicon_type :
LexiconType
Type associated to the lexicon entry. - wildcard_count :
int
Number of wildcards in the lexicon entry, e.g. *_noun boot*_noun
will be 2 and ski_noun boot_noun
will be 0.
semantic_tagsβ
class LexiconMetaData:
| ...
| semantic_tags: List[str] = None
n_gram_lengthβ
class LexiconMetaData:
| ...
| n_gram_length: int = None
lexicon_typeβ
class LexiconMetaData:
| ...
| lexicon_type: LexiconType = None
wildcard_countβ
class LexiconMetaData:
| ...
| wildcard_count: int = None
LexiconCollectionβ
class LexiconCollection(MutableMapping):
| ...
| def __init__(
| self,
| data: Optional[Dict[str, List[str]]] = None
| ) -> None
This is a dictionary object that will hold LexiconEntry
data in a fast to
access object. The keys of the dictionary are expected to be either just a
lemma or a combination of lemma and pos in the following format:
{lemma}|{pos}
e.g. Car|Noun
.
The value to each key is the associated semantic tags, whereby the semantic tags are in rank order, the most likely tag is the first tag in the list.
Note that the lemma
can be the token
itself rather than just its base form, e.g. can be Cars
rather than Car
.
This data type is used for single word lexicons; to store Multi Word
Expression (MWE) lexicons see the MWELexiconCollection
.
ParametersΒΆβ
- data :
Dict[str, List[str]]
, optional (default =None
)
Instance AttributesΒΆβ
- data :
Dict[str, List[str]]
Dictionary where the keys are {lemma}|{pos}
and the values are a list of associated semantic tags. If the data
parameter given was None
then the value of this attribute will be an empty dictionary.
ExamplesΒΆβ
from pymusas.lexicon_collection import LexiconEntry, LexiconCollection
lexicon_entry = LexiconEntry('London', ['Z3', 'Z1', 'A1'], 'noun')
collection = LexiconCollection()
collection.add_lexicon_entry(lexicon_entry)
most_likely_tag = collection['London|noun'][0]
assert most_likely_tag == 'Z3'
least_likely_tag = collection['London|noun'][-1]
assert least_likely_tag == 'A1'
add_lexicon_entryβ
class LexiconCollection(MutableMapping):
| ...
| def add_lexicon_entry(
| self,
| value: LexiconEntry,
| include_pos: bool = True
| ) -> None
Will add the LexiconEntry
to the collection, whereby the key is the
combination of the lemma and pos and the value are the semantic tags.
The lemma and pos are combined as follows: {lemma}|{pos}
, e.g.
Car|Noun
If the pos value is None then only the lemma is used: {lemma}
,
e.g. Car
ParametersΒΆβ
- value :
LexiconEntry
Lexicon Entry to add to the collection. - include_pos :
bool
, optional (default =True
)
Whether to include the POS tag within the key.
to_dictionaryβ
class LexiconCollection(MutableMapping):
| ...
| def to_dictionary() -> Dict[str, List[str]]
Returns the data
instance attribute.
ReturnsΒΆβ
Dict[str, List[str]]
to_bytesβ
class LexiconCollection(MutableMapping):
| ...
| def to_bytes() -> bytes
Serialises the LexiconCollection
to a bytestring.
ReturnsΒΆβ
bytes
from_bytesβ
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_bytes(bytes_data: bytes) -> "LexiconCollection"
Loads LexiconCollection
from the given bytestring and
returns it.
ParametersΒΆβ
- bytes_data :
bytes
The bytestring to load.
Returns
LexiconCollection
from_tsvβ
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_tsv(
| tsv_file_path: Union[PathLike, str],
| include_pos: bool = True
| ) -> Dict[str, List[str]]
Given a tsv_file_path
it will return a dictionary object that can
be used to create a LexiconCollection
.
Each line in the TSV file will be read in as a LexiconEntry
and added to a temporary LexiconCollection
, once all lines
in the TSV have been parsed the return value is the data
attribute of
the temporary LexiconCollection
.
If the file path is a URL, the file will be downloaded and cached using
pymusas.file_utils.download_url_file
.
If include_pos
is True and the TSV file does not contain a
pos
field heading then this will return a LexiconCollection that is
identical to a collection that ran this method with include_pos
equal
to False.
Code reference, the identification of a URL and the idea to do this has come from the AllenNLP library
ParametersΒΆβ
tsv_file_path :
Union[PathLike, str]
A file path or URL to a TSV file that contains at least two fields, with an optional third, with the following headings: lemma,
semantic_tags,
pos
(Optional). All other fields will be ignored.
include_pos :
bool
, optional (default =True
)
Whether to include the POS information, if the information is available, or not. See add_lexicon_entry
for more information on this parameter.
ReturnsΒΆβ
Dict[str, List[str]]
RaisesΒΆβ
ValueError
If the minimum field headings, lemma
and semantic_tags
, do not exist in the given TSV file.
ExamplesΒΆβ
include_pos
= True
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = 'https://raw.githubusercontent.com/apmoore1/Multilingual-USAS/master/Welsh/semantic_lexicon_cy.tsv'
welsh_lexicon_dict = LexiconCollection.from_tsv(welsh_lexicon_url, include_pos=True)
welsh_lexicon_collection = LexiconCollection(welsh_lexicon_dict)
assert welsh_lexicon_dict['ceir|noun'][0] == 'M3fn'
assert welsh_lexicon_dict['ceir|verb'][0] == 'A9+'
include_pos
= False
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = 'https://raw.githubusercontent.com/apmoore1/Multilingual-USAS/master/Welsh/semantic_lexicon_cy.tsv'
welsh_lexicon_dict = LexiconCollection.from_tsv(welsh_lexicon_url, include_pos=False)
welsh_lexicon_collection = LexiconCollection(welsh_lexicon_dict)
assert welsh_lexicon_dict['ceir'][0] == 'M3fn'
__str__β
class LexiconCollection(MutableMapping):
| ...
| def __str__() -> str
Human readable string.
__repr__β
class LexiconCollection(MutableMapping):
| ...
| def __repr__() -> str
Machine readable string. When printed and run eval()
over the string
you should be able to recreate the object.
__eq__β
class LexiconCollection(MutableMapping):
| ...
| def __eq__(other: object) -> bool
Given another object to compare to it will return True
if the other
object is the same class and contains the same data
instance attribute.
ParametersΒΆβ
- other :
object
The object to compare to.
ReturnsΒΆβ
True
MWELexiconCollectionβ
class MWELexiconCollection(MutableMapping):
| ...
| def __init__(
| self,
| data: Optional[Dict[str, List[str]]] = None,
| pos_mapper: Optional[Dict[str, List[str]]] = None
| ) -> None
A collection that stores Multi Word Expression (MWE) templates and their associated meta data.
This collection allows users to:
- Easily load MWE templates from a single TSV file.
- Find strings that match MWE templates taking into account any special syntax rules that should be applied, e.g. wildcards allow zero or more characters to appear after the word token and/or Part Of Speech (POS) tag. For more information on the MWE special syntax rules see the following notes.
- POS mapping, it can find strings that match MWE templates while taking into account mapping from one POS tagset to another in both a one to one and one to many mapping.
Note that even though this is a sub-class of a MutableMapping it has a
time complexity of O(n) for deletion unlike the standard Python MutableMapping,
see the following dict time complexities,
this is due to keeping track of the longest_non_special_mwe_template
and
longest_wildcard_mwe_template
.
As we do not currently support curly braces MWE template syntax,
any MWE templates that contain a {
or }
will be ignored and will not be
added to this collection, in addition a UserWarning
will be raised stating
this.
ParametersΒΆβ
- data :
Dict[str, List[str]]
, optional (default =None
)
Dictionary where the keys are MWE templates, of any LexiconType
, and the values are a list of associated semantic tags. - pos_mapper :
Dict[str, List[str]]
, optional (default =None
)
If not None
, maps from the lexicon's POS tagset to the desired POS tagset, whereby the mapping is a List
of tags, at the moment there is no preference order in this list of POS tags. The POS mapping is useful in situations whereby the lexicon's POS tagset is different to the token's. Note that the longer the List[str]
for each POS mapping the longer it will take to match MWE templates. A one to one mapping will have no speed impact on the tagger. A selection of POS mappers can be found in pymusas.pos_mapper
.
Instance AttributesΒΆβ
Note if the data
parameter given was None
then the value of all
dictionary attributes will be an empty dictionary and all integer values will
be 0
. If pos_mapper
parameter was None
then the pos_mapper
attribute
will be an empty dictionary.
- meta_data :
Dict[str, LexiconMetaData]
Dictionary where the keys are MWE templates, of any type, and the values are their associated meta data stored in a LexiconMetaData
object. - longest_non_special_mwe_template :
int
The longest MWE template with no special symbols measured by n-gram size. For example the MWE template ski_noun boot_noun
will be of length 2. - longest_wildcard_mwe_template :
int
The longest MWE template with at least one wildcard (*
) measured by n-gram size. For example the MWE template *_noun boot*_noun
will be of length 2. - longest_mwe_template :
int
The longest MWE template regardless of type measured by n-gram size. - most_wildcards_in_mwe_template :
int
The number of wildcards in the MWE template that contains the most wildcards, e.g. the MWE template ski_* *_noun
would contain 2 wildcards. This can be 0 if you have no wildcard MWE templates. - mwe_regular_expression_lookup :
Dict[int, Dict[str, Dict[str, re.Pattern]]]
A dictionary that can lookup all special syntax MWE templates and their regular expression pattern. These templates are found first by their n-gram length and then their first character symbol. The regular expression pattern is used for quick matching within the mwe_match
. From the special syntax only wildcard (*
) symbols are supported at the moment. - pos_mapper :
Dict[str, List[str]]
The given pos_mapper
. - one_to_many_pos_tags :
Set[str]
A set of POS tags that have a one to many mapping, this is created based on the pos_mapper
. This is empty if pos_mapper
is None
- pos_mapping_lookup :
Dict[str, str]
Only used if pos_mapper
is not None
. For all one-to-one POS mappings will store the mapped POS MWE template as keys and the original non-mapped (original) MWE templates as values, which can be used to lookup the meta data from meta_data
. - pos_mapping_regular_expression_lookup :
Dict[LexiconType, Dict[int, Dict[str, Dict[str, re.Pattern]]]]
Only used if pos_mapper
is not None
and will result in mwe_regular_expression_lookup
being empty as it replaces its functionality and extends it by handling the one-to-many POS mapping cases. When we have a one-to-many POS mapping case this requires a regular expression mapping even for non special syntax MWE templates. Compared to the mwe_regular_expression_lookup
the first set of keys represent the lexicon entry match type.
ExamplesΒΆβ
import re
from pymusas.lexicon_collection import MWELexiconCollection, LexiconType
mwe_collection = MWELexiconCollection()
mwe_collection['*_noun boot*_noun'] = ['Z0', 'Z3']
meta_data = mwe_collection['*_noun boot*_noun']
assert 2 == meta_data.n_gram_length
assert LexiconType.MWE_WILDCARD == meta_data.lexicon_type
assert 2 == meta_data.wildcard_count
most_likely_tag = meta_data.semantic_tags[0]
assert most_likely_tag == 'Z0'
least_likely_tag = meta_data.semantic_tags[-1]
assert least_likely_tag == 'Z3'
# change defaultdict to dict so the dictionary is easier to read and understand
assert ({k: dict(v) for k, v in mwe_collection.mwe_regular_expression_lookup.items()}
== {2: {'*': {'*_noun boot*_noun': re.compile('[^\\s_]*_noun\\ boot[^\\s_]*_noun')}}})
mwe_matchβ
class MWELexiconCollection(MutableMapping):
| ...
| def mwe_match(
| self,
| mwe_template: str,
| mwe_type: LexiconType
| ) -> List[str]
Returns a List
of MWE templates, with the given mwe_type
, that match
the given mwe_template
. If there are no matches the returned List
will be empty.
This method applies all of the special syntax rules that should be applied e.g. wildcards allow zero or more characters to appear after the word token and/or Part Of Speech (POS) tag. For more information on the MWE special syntax rules see the following notes.
ParametersΒΆβ
- mwe_template :
str
The MWE template that you want to match against, e.g. river_noun bank_noun
or ski_noun boots_noun
- mwe_type :
LexiconType
The type of MWE templates that you want to return.
ReturnsΒΆβ
List[str]
ExamplesΒΆβ
from pymusas.lexicon_collection import MWELexiconCollection, LexiconType
collection = MWELexiconCollection({'walking_noun boot_noun': ['Z2'], 'ski_noun boot_noun': ['Z2'], '*_noun boot_noun': ['Z2'], '*_noun *_noun': ['Z2']})
assert [] == collection.mwe_match('river_noun bank_noun', LexiconType.MWE_NON_SPECIAL)
assert ['walking_noun boot_noun'] == collection.mwe_match('walking_noun boot_noun', LexiconType.MWE_NON_SPECIAL)
assert ['*_noun boot_noun', '*_noun *_noun'] == collection.mwe_match('walking_noun boot_noun', LexiconType.MWE_WILDCARD)
to_dictionaryβ
class MWELexiconCollection(MutableMapping):
| ...
| def to_dictionary() -> Dict[str, List[str]]
Returns a dictionary of all MWE templates, the keys, stored in the collection and their associated semantic tags, the values.
This can then be used to re-create a MWELexiconCollection
.
ReturnsΒΆβ
Dict[str, List[str]]
ExamplesΒΆβ
from pymusas.lexicon_collection import (MWELexiconCollection,
LexiconType, LexiconMetaData)
mwe_collection = MWELexiconCollection()
mwe_collection['*_noun boot*_noun'] = ['Z0', 'Z3']
assert (mwe_collection['*_noun boot*_noun']
== LexiconMetaData(['Z0', 'Z3'], 2, LexiconType.MWE_WILDCARD, 2))
assert (mwe_collection.to_dictionary()
== {'*_noun boot*_noun': ['Z0', 'Z3']})
to_bytesβ
class MWELexiconCollection(MutableMapping):
| ...
| def to_bytes() -> bytes
Serialises the MWELexiconCollection
to a bytestring.
ReturnsΒΆβ
bytes
from_bytesβ
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_bytes(bytes_data: bytes) -> "MWELexiconCollection"
Loads MWELexiconCollection
from the given bytestring and
returns it.
ParametersΒΆβ
- bytes_data :
bytes
The bytestring to load.
Returns
MWELexiconCollection
from_tsvβ
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_tsv(
| tsv_file_path: Union[PathLike, str]
| ) -> Dict[str, List[str]]
Given a tsv_file_path
it will return a dictionary object
that can be used to create a MWELexiconCollection
.
Each line in the TSV file will be read in and added to a temporary
MWELexiconCollection
, once all lines
in the TSV have been parsed, the return value is the data
attribute of
the temporary MWELexiconCollection
.
If the file path is a URL, the file will be downloaded and cached using
pymusas.file_utils.download_url_file
.
Code reference, the identification of a URL and the idea to do this has come from the AllenNLP library
ParametersΒΆβ
tsv_file_path :
Union[PathLike, str]
A file path or URL to a TSV file that contains at least these two fields: mwe_template,
semantic_tags.
All other fields will be ignored.
ReturnsΒΆβ
Dict[str, List[str]]
RaisesΒΆβ
ValueError
If the minimum field headings, mwe_template
and semantic_tags
, do not exist in the given TSV file.
ExamplesΒΆβ
from pymusas.lexicon_collection import MWELexiconCollection
portuguese_lexicon_url = 'https://raw.githubusercontent.com/UCREL/Multilingual-USAS/master/Portuguese/mwe-pt.tsv'
mwe_lexicon_dict = MWELexiconCollection.from_tsv(portuguese_lexicon_url)
mwe_lexicon_collection = MWELexiconCollection(mwe_lexicon_dict)
assert mwe_lexicon_dict['abaixo_adv de_prep'][0] == 'M6'
assert mwe_lexicon_dict['arco_noun e_conj flecha_noun'][0] == 'K5.1'
escape_mweβ
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def escape_mwe(mwe_template: str) -> str
Returns the MWE template escaped so that it can be used in a regular expression.
The difference between this and the normal re.escape
method, is that we apply the re.escape
method to the tokens in the
MWE template and then replace \*
with [^\s_]*
so that the wildcards
keep their original meaning with respect to the MWE special syntax rules.
Furthermore, the POS tags in the MWE template replace the *
with
[^\s_]*
.
ParametersΒΆβ
- mwe_template :
str
The MWE template that you want to escape, e.g. river_noun bank_noun
or *_noun boot*_noun
ReturnsΒΆβ
str
ExamplesΒΆβ
from pymusas.lexicon_collection import MWELexiconCollection
mwe_escaped = MWELexiconCollection.escape_mwe('ano*_prep carta_noun')
assert r'ano[^\s_]*_prep\ carta_noun' == mwe_escaped
mwe_escaped = MWELexiconCollection.escape_mwe('ano_prep carta_*')
assert r'ano_prep\ carta_[^\s_]*' == mwe_escaped
__setitem__β
class MWELexiconCollection(MutableMapping):
| ...
| def __setitem__(key: str, value: List[str]) -> None
RaisesΒΆβ
ValueError
If using a pos_mapper
all POS tags within a MWE template cannot contain any wildcards or the POS tags can only be a wildcard, if this is not the case a ValueError
will be raised.
__str__β
class MWELexiconCollection(MutableMapping):
| ...
| def __str__() -> str
Human readable string.
__repr__β
class MWELexiconCollection(MutableMapping):
| ...
| def __repr__() -> str
Machine readable string. When printed and run eval()
over the string
you should be able to recreate the object.
__eq__β
class MWELexiconCollection(MutableMapping):
| ...
| def __eq__(other: object) -> bool
Given another object to compare to it will return True
if the other
object is the same class and contains the same meta_data
and
pos_mapper
instance attributes.
ParametersΒΆβ
- other :
object
The object to compare to.
ReturnsΒΆβ
True