The UCREL Doc class holds text-level linguistic information, which is stored as a list of UCREL Token instances.
from ucrel_api.ucrel_token import UCREL_Token
# Ten example tokens spanning two sentences. The first call names its tag
# arguments (pos_tag, usas_tag); the remaining calls pass their tags
# positionally, exactly as before. Both full stops carry None as last value.
DOC_TOKENS = [
    # Tokens 0-4: "hello how are you ."
    UCREL_Token('hello', pos_tag='UH', usas_tag='Z4'),
    UCREL_Token('how', 'RRQ', 'Z5'),
    UCREL_Token('are', 'VBR', 'A3+'),
    UCREL_Token('you', 'PPY', 'Z8mf'),
    UCREL_Token('.', '.', None),
    # Tokens 5-9: "I am great thanks ."
    UCREL_Token('I', 'PPIS1', 'Z8mf'),
    UCREL_Token('am', 'VBM', 'A3+'),
    UCREL_Token('great', 'JJ', 'A5.1+'),
    UCREL_Token('thanks', 'NN2', 'S1.2.4+'),
    UCREL_Token('.', '.', None),
]
# Build the example document from the raw text, the token list and the
# per-sentence index pairs.
# NOTE(review): the pairs look like half-open (start, end) offsets into
# DOC_TOKENS (ten tokens, split 0-5 and 5-10) -- confirm against UCREL_Doc.
example_doc = UCREL_Doc(
    text='hello how are you. I am great thanks.',
    tokens=DOC_TOKENS,
    sentence_indexes=[(0, 5), (5, 10)],
)
# Bare expression: shows the document when run interactively.
example_doc
# Walk the document one sentence at a time, printing a header line followed
# by each token of that sentence on its own line.
for index, sentence in enumerate(example_doc.sentences):
    print(f'Sentence {index}:')
    for token in sentence:
        print(f'{token}')
    # Only after the first sentence (index 0): emit a visual gap.
    if not index:
        print('\n')
# Iterating the document directly yields its tokens; print each one next to
# its position.
for index, token in enumerate(example_doc):
    print(f'{index} {token}')

# Bare expressions (results shown only when run interactively):
# negative indexing into the document ...
example_doc[-2]
# ... and the document's length.
len(example_doc)
# A document built from the same text, tokens and sentence indexes must
# compare equal to example_doc.
assert example_doc == UCREL_Doc(
    text='hello how are you. I am great thanks.',
    tokens=DOC_TOKENS,
    sentence_indexes=[(0, 5), (5, 10)],
)

# Same text and tokens, but constructed without sentence indexes: this one
# must compare unequal.
example_without_sent_indexes = UCREL_Doc(
    text='hello how are you. I am great thanks.',
    tokens=DOC_TOKENS,
)
assert example_doc != example_without_sent_indexes
# Compare a plain dict against the document. The comparison result itself is
# discarded; the point is the NotImplementedError handled below.
# NOTE(review): presumably raised by UCREL_Doc's equality check when the
# other operand is not a UCREL_Doc -- confirm against the class.
try:
    {
        'text': 'hello how are you. I am great thanks.',
        'tokens': DOC_TOKENS,
        'sentence_indexes': [(0, 5), (5, 10)],
    } == example_doc
except NotImplementedError:
    print('UCREL_Doc instances can only be compared '
          'with other UCREL_Doc instances:')
# JSON round trip: serialise the document, rebuild it, and compare.
# Bare call: its return value is shown only when run interactively.
example_doc.to_json()
# Serialise again, this time keeping the result for the round trip.
example_doc_json_string = example_doc.to_json()
# Rebuild a document from the serialised form via the from_json constructor.
another_example_doc = UCREL_Doc.from_json(example_doc_json_string)
# Bare expression: shows the rebuilt document when run interactively.
another_example_doc
# Bare comparison of original and rebuilt document; the result is not
# asserted, only displayed when run interactively.
example_doc == another_example_doc