# Pipeline configuration: an (empty) 'dataset' section plus an ordered
# 'pipeline' list of step dicts.  Every step identifies itself through its
# 'step' and 'type' keys and names the fields it reads and writes.
config = {
    'dataset': {},
    'pipeline': [
        # Loader step reading the 'body' field.
        {'fields': ['body'], 'step': 'loader', 'type': 'squirro_query'},
        # 'empty' filter on 'body', with mark_as_skipped enabled.
        {
            'fields': ['body'],
            'mark_as_skipped': True,
            'step': 'filter',
            'type': 'empty',
        },
        # Sentence tokenizer: reads 'body', writes 'extract_sentences'.
        # NOTE(review): in the previous revision the 'cleaning' and 'rules'
        # entries of this step were garbled (two variants fused mid-line,
        # which left the whole literal unparseable).  The values below are
        # the fuller variant as reconstructed from that text -- confirm them
        # against the intended configuration.
        {
            'cleaning': {
                '\t': ' ',
                '\n': ' ',
                ') ': '). ',
                '. (': ' (',
                '</p>': ' ',
                '<p>': ' ',
                'approx.': 'approx',
                'etc.': 'etc',
                'i.e.': 'ie',
            },
            'input_fields': ['body'],
            'output_fields': ['extract_sentences'],
            'rules': [
                '**',
                '\n-',
                '</h1>',
                '</h2>',
                '</h3>',
                '<br/>',
                '...',
                '…',
                ': ',
            ],
            'step': 'tokenizer',
            'type': 'sentences_nltk',
        },
        {
            'fields': ['extract_sentences'],
            'step': 'filter',
            'type': 'doc_split',
        },
        # HTML tokenizer applied in place on 'extract_sentences'.
        {
            'input_fields': ['extract_sentences'],
            'output_fields': ['extract_sentences'],
            'step': 'tokenizer',
            'type': 'html',
        },
        {
            'fields': ['extract_sentences'],
            'step': 'filter',
            'type': 'doc_split',
        },
        # HTML normalizer: 'extract_sentences' -> 'sentences_normalized'.
        {
            'input_fields': ['extract_sentences'],
            'output_fields': ['sentences_normalized'],
            'step': 'normalizer',
            'type': 'html',
        },
        # Regex filter; the single whitelist pattern matches strings of at
        # least 20 characters.
        {
            'fields': ['sentences_normalized'],
            'mark_as_skipped': True,
            'step': 'filter',
            'type': 'regex',
            'whitelist_regexes': ['^.{20,}$'],
        },
        # First proximity filter, writing 'prediction_tax_rate1'.
        {
            'blacklist_terms': [],
            'fields': ['sentences_normalized'],
            'matching_label': 'tax_rate1',
            'name': './models/ais/proximity',
            'non_matching_label': 'not_tax_rate1_tax_rate2',
            'output_field': 'prediction_tax_rate1',
            'step': 'filter',
            'type': 'proximity',
            'whitelist_terms': ['tax rate of~1|', 'tax rate~2|'],
        },
        # Second proximity filter, writing 'prediction_tax_rate2'.
        # NOTE(review): the first whitelist term looks unusual
        # ('"tax rate of"~1~3rate~4|'); it is kept verbatim -- verify it.
        {
            'blacklist_terms': [],
            'fields': ['sentences_normalized'],
            'matching_label': 'tax_rate2',
            'name': './models/ais/proximity',
            'non_matching_label': 'not_tax_rate1_tax_rate2',
            'output_field': 'prediction_tax_rate2',
            'step': 'filter',
            'type': 'proximity',
            'whitelist_terms': ['"tax rate of"~1~3rate~4|', 'tax rate~1|'],
        },
        # Merge both prediction fields into 'prediction' using ',' ...
        {
            'delimiter': ',',
            'input_fields': ['prediction_tax_rate1', 'prediction_tax_rate2'],
            'output_field': 'prediction',
            'step': 'filter',
            'type': 'merge',
        },
        # ... then a 'split' step over that same 'prediction' field.
        {
            'input_field': 'prediction',
            'output_field': 'prediction',
            'step': 'filter',
            'type': 'split',
        },
        {
            'fields': ['sentences_normalized', 'prediction'],
            'step': 'filter',
            'type': 'doc_join',
        },
        # Entity step; the shared non-matching label of the two proximity
        # filters is listed under 'excluded_values'.
        {
            'entity_name_field': 'Catalyst',
            'entity_type': 'Catalyst',
            'excluded_values': ['not_tax_rate1_tax_rate2'],
            'extract_field': 'sentences_normalized',
            'format_values': False,
            'global_property_field_map': {},
            'modes': ['process'],
            'property_field_map': {'Catalyst': ['prediction']},
            'required_properties': ['Catalyst'],
            'source_field': 'body',
            'step': 'filter',
            'type': 'squirro_entity',
        },
    ],
}