lexicon_collection
pymusas.lexicon_collection
LexiconType
@unique
class LexiconType(str, Enum)
Descriptions of the types associated with single word and Multi Word Expression (MWE)
lexicon entries and templates. Any type whose name contains NON_SPECIAL means
that it does not use any special syntax, for example it does not use wildcards
or curly braces.
The value attribute of each instance attribute is of type str describing
the type associated with that attribute. For the best explanation see the
example below.
Instance Attributes¶
- SINGLE_NON_SPECIAL : LexiconType
  Single word lexicon lookup.
- MWE_NON_SPECIAL : LexiconType
  MWE lexicon lookup.
- MWE_WILDCARD : LexiconType
  MWE lexicon lookup using a wildcard.
- MWE_CURLY_BRACES : LexiconType
  MWE lexicon lookup using curly braces.
Examples¶
from pymusas.lexicon_collection import LexiconType
assert 'Single Non Special' == LexiconType.SINGLE_NON_SPECIAL
assert 'Single Non Special' == LexiconType.SINGLE_NON_SPECIAL.value
assert 'SINGLE_NON_SPECIAL' == LexiconType.SINGLE_NON_SPECIAL.name
all_possible_values = {'Single Non Special', 'MWE Non Special',
'MWE Wildcard', 'MWE Curly Braces'}
assert all_possible_values == {lexicon_type.value for lexicon_type in LexiconType}
SINGLE_NON_SPECIAL
class LexiconType(str, Enum):
| ...
| SINGLE_NON_SPECIAL = 'Single Non Special'
MWE_NON_SPECIAL
class LexiconType(str, Enum):
| ...
| MWE_NON_SPECIAL = 'MWE Non Special'
MWE_WILDCARD
class LexiconType(str, Enum):
| ...
| MWE_WILDCARD = 'MWE Wildcard'
MWE_CURLY_BRACES
class LexiconType(str, Enum):
| ...
| MWE_CURLY_BRACES = 'MWE Curly Braces'
__repr__
class LexiconType(str, Enum):
| ...
| def __repr__() -> str
Machine readable string. When printed, running eval() over the string
should recreate the object.
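Examples¶
A minimal sketch of this contract; it assumes the repr string is of the form LexiconType.MWE_WILDCARD, which is only evaluable once LexiconType has been imported.
from pymusas.lexicon_collection import LexiconType
lexicon_type = LexiconType.MWE_WILDCARD
# eval() over the machine readable string recreates the enum member:
assert eval(repr(lexicon_type)) == lexicon_type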
LexiconEntry
@dataclass(init=True, repr=True, eq=True, order=False,
unsafe_hash=False, frozen=True)
class LexiconEntry
A LexiconEntry contains the semantic_tags that are associated with a
lemma and optionally the lemma's POS.
As frozen is true, the attributes cannot be assigned another value.
This data type is mainly used for single word lexicons, rather than Multi Word Expression (MWE).
Note the parameters to the __init__ are the same as the Instance
Attributes.
Instance Attributes¶
- lemma : str
  The lemma of a token or the token itself.
- semantic_tags : List[str]
  The semantic tags associated with the lemma and optional POS. The semantic tags are in rank order, the most likely tag is the first tag in the list.
- pos : str, optional (default = None)
  The Part Of Speech (POS) to be associated with the lemma.
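Examples¶
A short sketch of creating an entry; as the dataclass is frozen, assigning an attribute raises an error.
from pymusas.lexicon_collection import LexiconEntry
entry = LexiconEntry('London', ['Z3', 'Z1', 'A1'], 'noun')
assert entry.lemma == 'London'
# The semantic tags are in rank order, most likely tag first:
assert entry.semantic_tags[0] == 'Z3'
try:
    entry.lemma = 'Paris'  # frozen, cannot assign another value
except AttributeError:  # dataclasses.FrozenInstanceError subclasses AttributeError
    pass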
lemma
class LexiconEntry:
| ...
| lemma: str = None
semantic_tags
class LexiconEntry:
| ...
| semantic_tags: List[str] = None
pos
class LexiconEntry:
| ...
| pos: Optional[str] = None
LexiconMetaData
@dataclass(init=True, repr=True, eq=True, order=False,
unsafe_hash=False, frozen=True)
class LexiconMetaData
A LexiconMetaData object contains all of the meta data about a given single word or Multi Word Expression (MWE) lexicon entry. This meta data can be used to help rank single and MWE entries when tagging.
As frozen is true, the attributes cannot be assigned another value.
Note the parameters to the __init__ are the same as the Instance
Attributes.
Instance Attributes¶
- semantic_tags : List[str]
  The semantic tags associated with the lexicon entry. The semantic tags are in rank order, the most likely tag is the first tag in the list.
- n_gram_length : int
  The n-gram size of the lexicon entry, e.g. *_noun boot*_noun will be of length 2 and all single word lexicon entries will be of length 1.
- lexicon_type : LexiconType
  Type associated with the lexicon entry.
- wildcard_count : int
  Number of wildcards in the lexicon entry, e.g. *_noun boot*_noun will be 2 and ski_noun boot_noun will be 0.
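Examples¶
A minimal sketch of constructing the meta data directly; in practice these objects are created for you, as in the MWELexiconCollection examples below.
from pymusas.lexicon_collection import LexiconMetaData, LexiconType
meta_data = LexiconMetaData(['Z0', 'Z3'], 2, LexiconType.MWE_WILDCARD, 2)
assert meta_data.semantic_tags[0] == 'Z0'  # most likely tag first
assert meta_data.n_gram_length == 2
assert meta_data.lexicon_type == LexiconType.MWE_WILDCARD
assert meta_data.wildcard_count == 2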
semantic_tags
class LexiconMetaData:
| ...
| semantic_tags: List[str] = None
n_gram_length
class LexiconMetaData:
| ...
| n_gram_length: int = None
lexicon_type
class LexiconMetaData:
| ...
| lexicon_type: LexiconType = None
wildcard_count
class LexiconMetaData:
| ...
| wildcard_count: int = None
LexiconCollection
class LexiconCollection(MutableMapping):
| ...
| def __init__(
| self,
| data: Optional[Dict[str, List[str]]] = None
| ) -> None
This is a dictionary object that will hold LexiconEntry data in a fast-to-access
object. The keys of the dictionary are expected to be either just a
lemma or a combination of lemma and pos in the following format:
{lemma}|{pos} e.g. Car|Noun.
The value to each key is the associated semantic tags, whereby the semantic tags are in rank order, the most likely tag is the first tag in the list.
Note that the lemma can be the token
itself rather than just its base form, e.g. it can be Cars rather than Car.
This data type is used for single word lexicons, to store Multi Word
Expression (MWE) see the MWELexiconCollection.
Parameters¶
- data : Dict[str, List[str]], optional (default = None)
  Dictionary where the keys are {lemma}|{pos} and the values are a list of associated semantic tags.
Instance Attributes¶
- data : Dict[str, List[str]]
  Dictionary where the keys are {lemma}|{pos} and the values are a list of associated semantic tags. If the data parameter given was None then the value of this attribute will be an empty dictionary.
Examples¶
from pymusas.lexicon_collection import LexiconEntry, LexiconCollection
lexicon_entry = LexiconEntry('London', ['Z3', 'Z1', 'A1'], 'noun')
collection = LexiconCollection()
collection.add_lexicon_entry(lexicon_entry)
most_likely_tag = collection['London|noun'][0]
assert most_likely_tag == 'Z3'
least_likely_tag = collection['London|noun'][-1]
assert least_likely_tag == 'A1'
add_lexicon_entry
class LexiconCollection(MutableMapping):
| ...
| def add_lexicon_entry(
| self,
| value: LexiconEntry,
| include_pos: bool = True
| ) -> None
Will add the LexiconEntry to the collection, whereby the key is the
combination of the lemma and pos and the value is the semantic tags.
The lemma and pos are combined as follows: {lemma}|{pos}, e.g.
Car|Noun
If the pos value is None then only the lemma is used: {lemma},
e.g. Car
Note: if the key already exists then the most recent entry will overwrite the existing entry.
Parameters¶
- value : LexiconEntry
  Lexicon Entry to add to the collection.
- include_pos : bool, optional (default = True)
  Whether to include the POS tag within the key.
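Examples¶
A brief sketch of the effect of include_pos on the key:
from pymusas.lexicon_collection import LexiconEntry, LexiconCollection
collection = LexiconCollection()
entry = LexiconEntry('London', ['Z3', 'Z1', 'A1'], 'noun')
collection.add_lexicon_entry(entry)
assert collection['London|noun'] == ['Z3', 'Z1', 'A1']  # key is {lemma}|{pos}
collection.add_lexicon_entry(entry, include_pos=False)
assert collection['London'] == ['Z3', 'Z1', 'A1']  # key is just the lemma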
to_dictionary
class LexiconCollection(MutableMapping):
| ...
| def to_dictionary() -> Dict[str, List[str]]
Returns the data instance attribute.
Returns¶
Dict[str, List[str]]
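Examples¶
A minimal sketch of the round trip this enables:
from pymusas.lexicon_collection import LexiconCollection
collection = LexiconCollection({'London|noun': ['Z3', 'Z1', 'A1']})
data = collection.to_dictionary()
assert data == {'London|noun': ['Z3', 'Z1', 'A1']}
# The returned dictionary can be used to re-create an equal collection:
assert LexiconCollection(data) == collection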
to_bytes
class LexiconCollection(MutableMapping):
| ...
| def to_bytes() -> bytes
Serialises the LexiconCollection to a bytestring.
Returns¶
bytes
from_bytes
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_bytes(bytes_data: bytes) -> "LexiconCollection"
Loads LexiconCollection from the given bytestring and
returns it.
Parameters¶
- bytes_data :
bytes
The bytestring to load.
Returns¶
LexiconCollection
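Examples¶
A round trip sketch through to_bytes and from_bytes:
from pymusas.lexicon_collection import LexiconCollection
collection = LexiconCollection({'London|noun': ['Z3', 'Z1', 'A1']})
loaded_collection = LexiconCollection.from_bytes(collection.to_bytes())
assert loaded_collection == collection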
from_tsv
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_tsv(
| tsv_file_path: Union[PathLike, str],
| include_pos: bool = True
| ) -> Dict[str, List[str]]
Given a tsv_file_path it will return a dictionary object that can
be used to create a LexiconCollection.
Each line in the TSV file will be read in as a LexiconEntry
and added to a temporary LexiconCollection; once all lines
in the TSV have been parsed, the return value is the data attribute of
the temporary LexiconCollection.
If the file path is a URL, the file will be downloaded and cached using
pymusas.file_utils.download_url_file.
If include_pos is True and the TSV file does not contain a
pos field heading, then this will return a dictionary identical to one
created by running this method with include_pos equal to False.
Code reference: the identification of a URL, and the idea to download and cache it, come from the AllenNLP library.
Parameters¶
- tsv_file_path : Union[PathLike, str]
  A file path or URL to a TSV file that contains at least two fields, with an optional third, with the following headings: lemma, semantic_tags, and pos (optional). All other fields will be ignored.
- include_pos : bool, optional (default = True)
  Whether to include the POS information, if the information is available, or not. See add_lexicon_entry for more information on this parameter.
Returns¶
Dict[str, List[str]]
Raises¶
ValueError
  If the minimum field headings, lemma and semantic_tags, do not exist in the given TSV file.
Examples¶
include_pos = True
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = 'https://raw.githubusercontent.com/apmoore1/Multilingual-USAS/master/Welsh/semantic_lexicon_cy.tsv'
welsh_lexicon_dict = LexiconCollection.from_tsv(welsh_lexicon_url, include_pos=True)
welsh_lexicon_collection = LexiconCollection(welsh_lexicon_dict)
assert welsh_lexicon_dict['ceir|noun'][0] == 'M3fn'
assert welsh_lexicon_dict['ceir|verb'][0] == 'A9+'
include_pos = False
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = 'https://raw.githubusercontent.com/apmoore1/Multilingual-USAS/master/Welsh/semantic_lexicon_cy.tsv'
welsh_lexicon_dict = LexiconCollection.from_tsv(welsh_lexicon_url, include_pos=False)
welsh_lexicon_collection = LexiconCollection(welsh_lexicon_dict)
assert welsh_lexicon_dict['ceir'][0] == 'M3fn'
merge
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def merge(
| *lexicon_collections: "LexiconCollection"
| ) -> "LexiconCollection"
Given more than one lexicon collection, it will create a single lexicon collection whereby the lexicon data from each is combined.
Note: the data is loaded in list order, therefore the last lexicon
collection will take precedence, i.e. if the last contains London: [Z3]
and the first contains London: [Z2] then the returned
LexiconCollection will only contain the one entry: London: [Z3].
Note: if the lexicon collections contain POS information, we assume that all of the lexicon collections use the same POS tagset; if they do not, this could cause issues at tag time.
Parameters¶
- *lexicon_collections : LexiconCollection
  More than one lexicon collection to be merged.
Returns¶
LexiconCollection
Examples¶
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/Welsh/semantic_lexicon_cy.tsv"
english_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/English/semantic_lexicon_en.tsv"
welsh_lexicon_data = LexiconCollection.from_tsv(welsh_lexicon_url, include_pos=True)
welsh_lexicon = LexiconCollection(welsh_lexicon_data)
english_lexicon_data = LexiconCollection.from_tsv(english_lexicon_url, include_pos=True)
english_lexicon = LexiconCollection(english_lexicon_data)
combined_lexicon_collection = LexiconCollection.merge(welsh_lexicon, english_lexicon)
assert isinstance(combined_lexicon_collection, LexiconCollection)
assert combined_lexicon_collection["Aber-lash|pnoun"] == ["Z2"]
assert combined_lexicon_collection["Aqua|PROPN"] == ["Z3c"]
tsv_merge
class LexiconCollection(MutableMapping):
| ...
| @staticmethod
| def tsv_merge(
| *tsv_file_paths: Union[PathLike, str],
| include_pos: bool = True
| ) -> dict[str, list[str]]
Given one or more TSV files, it will create a single dictionary object
with the combination of all the lexicon data in each TSV; this dictionary
object can then be used to create a LexiconCollection.
For more information on how the TSV data is loaded see from_tsv.
Note: the data is loaded in list order, therefore the last TSV file
will take precedence, i.e. if the last TSV file contains London: [Z3]
and the first TSV file contains London: [Z2] then the returned
dictionary will only contain the one entry: London: [Z3].
Note: if the TSV files contain POS information, we assume that all of the TSV files use the same POS tagset; if they do not, this could cause issues at tag time.
Parameters¶
- *tsv_file_paths : Union[PathLike, str]
  File paths and/or URLs to TSV files that contain at least two fields, with an optional third, with the following headings: lemma, semantic_tags, and pos (optional). All other fields will be ignored.
- include_pos : bool, optional (default = True)
  Whether to include the POS information, if the information is available, or not. See add_lexicon_entry for more information on this parameter.
Returns¶
dict[str, list[str]]
Raises¶
ValueError
  If the minimum field headings, lemma and semantic_tags, do not exist in the given TSV files.
Examples¶
from pymusas.lexicon_collection import LexiconCollection
welsh_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/Welsh/semantic_lexicon_cy.tsv"
english_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/English/semantic_lexicon_en.tsv"
tsv_urls = [welsh_lexicon_url, english_lexicon_url]
combined_lexicon_collection = LexiconCollection.tsv_merge(*tsv_urls, include_pos=True)
assert isinstance(combined_lexicon_collection, dict)
assert combined_lexicon_collection["Aber-lash|pnoun"] == ["Z2"]
assert combined_lexicon_collection["Aqua|PROPN"] == ["Z3c"]
__str__
class LexiconCollection(MutableMapping):
| ...
| def __str__() -> str
Human readable string.
__repr__
class LexiconCollection(MutableMapping):
| ...
| def __repr__() -> str
Machine readable string. When printed, running eval() over the string
should recreate the object.
__eq__
class LexiconCollection(MutableMapping):
| ...
| def __eq__(other: object) -> bool
Given another object to compare to, it will return True if the other
object is of the same class and contains the same data instance attribute.
Parameters¶
- other : object
  The object to compare to.
Returns¶
bool
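Examples¶
A brief sketch of the equality contract; note that an object of a different class, such as a plain dict with the same data, does not compare equal.
from pymusas.lexicon_collection import LexiconCollection
assert LexiconCollection({'London|noun': ['Z3']}) == LexiconCollection({'London|noun': ['Z3']})
assert LexiconCollection({'London|noun': ['Z3']}) != {'London|noun': ['Z3']}  # different class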
MWELexiconCollection
class MWELexiconCollection(MutableMapping):
| ...
| def __init__(
| self,
| data: Optional[Dict[str, List[str]]] = None,
| pos_mapper: Optional[Dict[str, List[str]]] = None
| ) -> None
A collection that stores Multi Word Expression (MWE) templates and their associated meta data.
This collection allows users to:
- Easily load MWE templates from a single TSV file.
- Find strings that match MWE templates taking into account any special syntax rules that should be applied, e.g. wildcards allow zero or more characters to appear after the word token and/or Part Of Speech (POS) tag. For more information on the MWE special syntax rules see the following notes.
- Perform POS mapping: it can find strings that match MWE templates while taking into account a mapping from one POS tagset to another, in both one-to-one and one-to-many mappings.
Note that even though this is a sub-class of MutableMapping it has a
time complexity of O(n) for deletion, unlike the standard Python MutableMapping
(see the following dict time complexities); this is due to keeping track of the
longest_non_special_mwe_template and longest_wildcard_mwe_template.
As we do not currently support the curly braces MWE template syntax, any
MWE templates that contain a { or } will be ignored and will not be
added to this collection; in addition, a UserWarning will be raised stating
this.
Parameters¶
- data : Dict[str, List[str]], optional (default = None)
  Dictionary where the keys are MWE templates, of any LexiconType, and the values are a list of associated semantic tags.
- pos_mapper : Dict[str, List[str]], optional (default = None)
  If not None, maps from the lexicon's POS tagset to the desired POS tagset, whereby the mapping is a List of tags; at the moment there is no preference order in this list of POS tags. The POS mapping is useful in situations whereby the lexicon's POS tagset is different to the token's. Note that the longer the List[str] for each POS mapping, the longer it will take to match MWE templates. A one-to-one mapping will have no speed impact on the tagger. A selection of POS mappers can be found in pymusas.pos_mapper.
Instance Attributes¶
Note if the data parameter given was None then the value of all
dictionary attributes will be an empty dictionary and all integer values will
be 0. If the pos_mapper parameter was None then the pos_mapper attribute
will be an empty dictionary.
- meta_data : Dict[str, LexiconMetaData]
  Dictionary where the keys are MWE templates, of any type, and the values are their associated meta data stored in a LexiconMetaData object.
- longest_non_special_mwe_template : int
  The longest MWE template with no special symbols measured by n-gram size. For example the MWE template ski_noun boot_noun will be of length 2.
- longest_wildcard_mwe_template : int
  The longest MWE template with at least one wildcard (*) measured by n-gram size. For example the MWE template *_noun boot*_noun will be of length 2.
- longest_mwe_template : int
  The longest MWE template regardless of type measured by n-gram size.
- most_wildcards_in_mwe_template : int
  The number of wildcards in the MWE template that contains the most wildcards, e.g. the MWE template ski_* *_noun would contain 2 wildcards. This can be 0 if you have no wildcard MWE templates.
- mwe_regular_expression_lookup : Dict[int, Dict[str, Dict[str, re.Pattern]]]
  A dictionary that can look up, for all special syntax MWE templates, their regular expression pattern. These templates are found first by their n-gram length and then by their first character symbol. The regular expression pattern is used for quick matching within mwe_match. Of the special syntax, only wildcard (*) symbols are supported at the moment.
- pos_mapper : Dict[str, List[str]]
  The given pos_mapper.
- one_to_many_pos_tags : Set[str]
  A set of POS tags that have a one to many mapping; this is created based on the pos_mapper. This is empty if pos_mapper is None.
- pos_mapping_lookup : Dict[str, str]
  Only used if pos_mapper is not None. For all one-to-one POS mappings it will store the mapped POS MWE templates as keys and the non-mapped (original) MWE templates as values, which can be used to look up the meta data from meta_data.
- pos_mapping_regular_expression_lookup : Dict[LexiconType, Dict[int, Dict[str, Dict[str, re.Pattern]]]]
  Only used if pos_mapper is not None, and will result in mwe_regular_expression_lookup being empty, as it replaces its functionality and extends it by handling the one-to-many POS mapping cases. When we have a one-to-many POS mapping case, a regular expression mapping is required even for non special syntax MWE templates. Compared to mwe_regular_expression_lookup, the first set of keys represents the lexicon entry match type.
Examples¶
import re
from pymusas.lexicon_collection import MWELexiconCollection, LexiconType
mwe_collection = MWELexiconCollection()
mwe_collection['*_noun boot*_noun'] = ['Z0', 'Z3']
meta_data = mwe_collection['*_noun boot*_noun']
assert 2 == meta_data.n_gram_length
assert LexiconType.MWE_WILDCARD == meta_data.lexicon_type
assert 2 == meta_data.wildcard_count
most_likely_tag = meta_data.semantic_tags[0]
assert most_likely_tag == 'Z0'
least_likely_tag = meta_data.semantic_tags[-1]
assert least_likely_tag == 'Z3'
# change defaultdict to dict so the dictionary is easier to read and understand
assert ({k: dict(v) for k, v in mwe_collection.mwe_regular_expression_lookup.items()}
== {2: {'*': {'*_noun boot*_noun': re.compile('[^\\s_]*_noun\\ boot[^\\s_]*_noun')}}})
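A further sketch showing the pos_mapper related attributes; the mapping values here are hypothetical and stand in for any target tagset.
from pymusas.lexicon_collection import MWELexiconCollection
pos_mapper = {'noun': ['NOUN', 'PROPN']}  # hypothetical one-to-many mapping
mwe_collection = MWELexiconCollection(data={'ski_noun boot_noun': ['Z2']},
                                      pos_mapper=pos_mapper)
assert mwe_collection.pos_mapper == pos_mapper
assert 'noun' in mwe_collection.one_to_many_pos_tags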
mwe_match
class MWELexiconCollection(MutableMapping):
| ...
| def mwe_match(
| self,
| mwe_template: str,
| mwe_type: LexiconType
| ) -> List[str]
Returns a List of MWE templates, with the given mwe_type, that match
the given mwe_template. If there are no matches, the returned List
will be empty.
This method applies all of the special syntax rules that should be applied e.g. wildcards allow zero or more characters to appear after the word token and/or Part Of Speech (POS) tag. For more information on the MWE special syntax rules see the following notes.
Parameters¶
- mwe_template : str
  The MWE template that you want to match against, e.g. river_noun bank_noun or ski_noun boots_noun.
- mwe_type : LexiconType
  The type of MWE templates that you want to return.
Returns¶
List[str]
Examples¶
from pymusas.lexicon_collection import MWELexiconCollection, LexiconType
collection = MWELexiconCollection({'walking_noun boot_noun': ['Z2'], 'ski_noun boot_noun': ['Z2'], '*_noun boot_noun': ['Z2'], '*_noun *_noun': ['Z2']})
assert [] == collection.mwe_match('river_noun bank_noun', LexiconType.MWE_NON_SPECIAL)
assert ['walking_noun boot_noun'] == collection.mwe_match('walking_noun boot_noun', LexiconType.MWE_NON_SPECIAL)
assert ['*_noun boot_noun', '*_noun *_noun'] == collection.mwe_match('walking_noun boot_noun', LexiconType.MWE_WILDCARD)
to_dictionary
class MWELexiconCollection(MutableMapping):
| ...
| def to_dictionary() -> Dict[str, List[str]]
Returns a dictionary of all MWE templates (the keys) stored in the collection and their associated semantic tags (the values).
This can then be used to re-create a MWELexiconCollection.
Returns¶
Dict[str, List[str]]
Examples¶
from pymusas.lexicon_collection import (MWELexiconCollection,
LexiconType, LexiconMetaData)
mwe_collection = MWELexiconCollection()
mwe_collection['*_noun boot*_noun'] = ['Z0', 'Z3']
assert (mwe_collection['*_noun boot*_noun']
== LexiconMetaData(['Z0', 'Z3'], 2, LexiconType.MWE_WILDCARD, 2))
assert (mwe_collection.to_dictionary()
== {'*_noun boot*_noun': ['Z0', 'Z3']})
to_bytes
class MWELexiconCollection(MutableMapping):
| ...
| def to_bytes() -> bytes
Serialises the MWELexiconCollection to a bytestring.
Returns¶
bytes
from_bytes
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_bytes(bytes_data: bytes) -> "MWELexiconCollection"
Loads MWELexiconCollection from the given bytestring and
returns it.
Parameters¶
- bytes_data :
bytes
The bytestring to load.
Returns¶
MWELexiconCollection
from_tsv
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def from_tsv(
| tsv_file_path: Union[PathLike, str]
| ) -> Dict[str, List[str]]
Given a tsv_file_path it will return a dictionary object
that can be used to create a MWELexiconCollection.
Each line in the TSV file will be read in and added to a temporary
MWELexiconCollection; once all lines
in the TSV have been parsed, the return value is the data attribute of
the temporary MWELexiconCollection.
If the file path is a URL, the file will be downloaded and cached using
pymusas.file_utils.download_url_file.
Code reference: the identification of a URL, and the idea to download and cache it, come from the AllenNLP library.
Parameters¶
- tsv_file_path : Union[PathLike, str]
  A file path or URL to a TSV file that contains at least these two fields, with the following headings: mwe_template and semantic_tags. All other fields will be ignored.
Returns¶
Dict[str, List[str]]
Raises¶
ValueError
  If the minimum field headings, mwe_template and semantic_tags, do not exist in the given TSV file.
Examples¶
from pymusas.lexicon_collection import MWELexiconCollection
portuguese_lexicon_url = 'https://raw.githubusercontent.com/UCREL/Multilingual-USAS/master/Portuguese/mwe-pt.tsv'
mwe_lexicon_dict = MWELexiconCollection.from_tsv(portuguese_lexicon_url)
mwe_lexicon_collection = MWELexiconCollection(mwe_lexicon_dict)
assert mwe_lexicon_dict['abaixo_adv de_prep'][0] == 'M6'
assert mwe_lexicon_dict['arco_noun e_conj flecha_noun'][0] == 'K5.1'
tsv_merge
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def tsv_merge(*tsv_file_paths: Union[PathLike, str]) -> dict[str, list[str]]
Given one or more TSV files, it will create a dictionary
object that can be used to create a MWELexiconCollection, whereby
this dictionary is the combination of all of the lexicon information
in the TSV files.
Note: the data is loaded in list order, therefore the last TSV file
will take precedence, i.e. if the last TSV file contains
London_* city_*: [Z3] and the first TSV file contains
London_* city_*: [Z2] then the returned dictionary will only
contain the one entry: London_* city_*: [Z3].
Note: if the POS tagsets used in the TSV files are different, this could cause issues at tag time.
Parameters¶
- *tsv_file_paths : Union[PathLike, str]
  File paths and/or URLs to TSV files that contain at least these two fields, with the following headings: mwe_template and semantic_tags. All other fields will be ignored.
Returns¶
dict[str, list[str]]
Raises¶
ValueError
  If the minimum field headings, mwe_template and semantic_tags, do not exist in the given TSV files.
Examples¶
from pymusas.lexicon_collection import MWELexiconCollection
welsh_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/Welsh/mwe-welsh.tsv"
english_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/refs/heads/master/English/mwe-en.tsv"
tsv_urls = [welsh_lexicon_url, english_lexicon_url]
combined_lexicon_data = MWELexiconCollection.tsv_merge(*tsv_urls)
assert isinstance(combined_lexicon_data, dict)
assert combined_lexicon_data["Academy_NOUN Award_NOUN"] == ["A5.1+/K1"]
assert combined_lexicon_data["Ffwrnais_* Dyfi_*"] == ["Z2"]
escape_mwe
class MWELexiconCollection(MutableMapping):
| ...
| @staticmethod
| def escape_mwe(mwe_template: str) -> str
Returns the MWE template escaped so that it can be used in a regular expression.
The difference between this and the normal re.escape
method is that we apply the re.escape method to the tokens in the
MWE template and then replace \* with [^\s_]* so that the wildcards
keep their original meaning with respect to the MWE special syntax rules.
Furthermore, the POS tags in the MWE template have their * replaced with
[^\s_]*.
Parameters¶
- mwe_template : str
  The MWE template that you want to escape, e.g. river_noun bank_noun or *_noun boot*_noun.
Returns¶
str
Examples¶
from pymusas.lexicon_collection import MWELexiconCollection
mwe_escaped = MWELexiconCollection.escape_mwe('ano*_prep carta_noun')
assert r'ano[^\s_]*_prep\ carta_noun' == mwe_escaped
mwe_escaped = MWELexiconCollection.escape_mwe('ano_prep carta_*')
assert r'ano_prep\ carta_[^\s_]*' == mwe_escaped
__setitem__
class MWELexiconCollection(MutableMapping):
| ...
| def __setitem__(key: str, value: List[str]) -> None
Raises¶
ValueError
If using a pos_mapper, a POS tag within an MWE template cannot contain any wildcards unless the whole tag is a single wildcard; if this is not the case a ValueError will be raised.
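Examples¶
A hypothetical sketch of this constraint, assuming a simple one-to-one pos_mapper:
from pymusas.lexicon_collection import MWELexiconCollection
mwe_collection = MWELexiconCollection(pos_mapper={'noun': ['NOUN']})
mwe_collection['ski_noun boot_*'] = ['Z2']  # a POS tag that is only a wildcard is allowed
try:
    mwe_collection['ski_noun boot_no*n'] = ['Z2']  # POS tag containing a wildcard
except ValueError:
    pass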
__str__
class MWELexiconCollection(MutableMapping):
| ...
| def __str__() -> str
Human readable string.
__repr__
class MWELexiconCollection(MutableMapping):
| ...
| def __repr__() -> str
Machine readable string. When printed, running eval() over the string
should recreate the object.
__eq__
class MWELexiconCollection(MutableMapping):
| ...
| def __eq__(other: object) -> bool
Given another object to compare to, it will return True if the other
object is of the same class and contains the same meta_data and
pos_mapper instance attributes.
Parameters¶
- other : object
  The object to compare to.
Returns¶
bool