redshred.enrichments package

Submodules

redshred.enrichments.base module

class redshred.enrichments.base.PerspectiveConfiguration(*, name: str, perspective: str, segments: SegmentQuery = None, description: str = '', config: Dict[str, Any] = None, debug: bool = False, **extra_data: Any)[source]

Bases: BaseModel

A model representing a perspective configuration.

This model is used to define a perspective configuration, which includes the name, perspective, segments, description, config, and debug.

Attributes:

name: The canonical enrichment name to run.
perspective: A human-readable label, which must be unique.
segments: The queries to run against the server, along with any prerequisites required for processing.
description: A description of the perspective.
config: Specific configuration options for the enrichment.
debug: Always run if True.

class Config[source]

Bases: _DefaultPydanticConfig

config: Dict[str, Any]
debug: bool
description: str
name: str
perspective: str
segments: SegmentQuery
class redshred.enrichments.base.SegmentQuery(*, queries: List[str] | Dict[str, Any] = None, prerequisites: List[str] = None, **extra_data: Any)[source]

Bases: BaseModel

A model representing a segment query.

This model is used to define a segment query, which includes a list of queries and prerequisites and is used when defining input for a Perspective.

Attributes:

queries: The queries to run, given either as a list of query strings or as a dictionary.
prerequisites: A list of prerequisites, given as perspective names.

class Config[source]

Bases: _DefaultPydanticConfig

prerequisites: List[str]
queries: List[str] | Dict[str, Any]

redshred.enrichments.defined_acronyms module

class redshred.enrichments.defined_acronyms.DefinedAcronymsPerspective(*, name: Literal['defined_acronyms'] = 'defined_acronyms', perspective: str, segments: SegmentQuery = None, description: str = '', config: DefinedAcronymsPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

This enrichment extracts acronyms and their definitions (expanded forms) from a piece of text. Example: Alcoholics Anonymous (AA)

config: DefinedAcronymsPerspectiveConfig
name: Literal['defined_acronyms']
class redshred.enrichments.defined_acronyms.DefinedAcronymsPerspectiveConfig[source]

Bases: BaseModel

This enrichment extracts acronyms and their definitions (expanded forms) from a piece of text. Example: Alcoholics Anonymous (AA)

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
redshred.enrichments.defined_acronyms.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.external_api module

class redshred.enrichments.external_api.ExternalAPIPerspective(*, name: Literal['external_api'] = 'external_api', perspective: str, segments: SegmentQuery = None, description: str = '', config: ExternalAPIPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Config class for the External API

config: ExternalAPIPerspectiveConfig
name: Literal['external_api']
class redshred.enrichments.external_api.ExternalAPIPerspectiveConfig(*, template: str, jmespath_source: str = '', template_variables: Dict = None, jmespath_transformation: str = '')[source]

Bases: BaseModel

Config class for the External API

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
jmespath_source: str
jmespath_transformation: str
template: str
template_variables: Dict
redshred.enrichments.external_api.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.grouper module

class redshred.enrichments.grouper.GrouperPerspective(*, name: Literal['grouper'] = 'grouper', perspective: str, segments: SegmentQuery = None, description: str = '', config: GrouperPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Perspective configuration for the grouper enrichment.

config: GrouperPerspectiveConfig
name: Literal['grouper']
class redshred.enrichments.grouper.GrouperPerspectiveConfig(*, operations: str = 'xy', operation_labels: List[str] = None, root_label: str = 'root', x_gap: str = '1%', y_gap: str = '1%', whitespace_calculation_method: str = 'average', whitespace_method_options: Dict = None, hull_method: GrouperPerspectiveHullMethod = 'convex_hull', hull_method_options: Dict = None)[source]

Bases: BaseModel

Configuration options for the grouper enrichment.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
hull_method: GrouperPerspectiveHullMethod
hull_method_options: Dict
operation_labels: List[str]
operations: str
root_label: str
whitespace_calculation_method: str
whitespace_method_options: Dict
x_gap: str
y_gap: str
class redshred.enrichments.grouper.GrouperPerspectiveHullMethod(value)[source]

Bases: str, Enum

Method for calculating the hull of the group: one of convex_hull, concave_hull, bounding_box, or minimum_rotated_rectangle. Use None/null to return the full multipolygon output instead.

bounding_box = 'bounding_box'
concave_hull = 'concave_hull'
convex_hull = 'convex_hull'
minimum_rotated_rectangle = 'minimum_rotated_rectangle'
redshred.enrichments.grouper.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.huggingface module

class redshred.enrichments.huggingface.HuggingfacePerspective(*, name: Literal['huggingface'] = 'huggingface', perspective: str, segments: SegmentQuery = None, description: str = '', config: HuggingfacePerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Perspective configuration for the huggingface enrichment.

config: HuggingfacePerspectiveConfig
name: Literal['huggingface']
class redshred.enrichments.huggingface.HuggingfacePerspectiveConfig(*, pipeline_task: str, task_specific_template: str = '', tokenizer: str = '', model: str, model_source: str = 'huggingface', tokenizer_class: str = 'auto', model_class: str = 'auto', task_config: Dict = None, task_config_class: str = 'auto')[source]

Bases: BaseModel

Configuration options for the huggingface enrichment.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
model: str
model_class: str
model_source: str
pipeline_task: str
task_config: Dict
task_config_class: str
task_specific_template: str
tokenizer: str
tokenizer_class: str
redshred.enrichments.huggingface.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.iris module

class redshred.enrichments.iris.IrisPerspective(*, name: Literal['iris'] = 'iris', perspective: str, segments: SegmentQuery = None, description: str = '', config: IrisPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Enrichment configuration documentation. Make sure you fill this out fully: if an option is not included here, the configuration will not work.

config: IrisPerspectiveConfig
name: Literal['iris']
class redshred.enrichments.iris.IrisPerspectiveConfig(*, model: str = 'iris-v0.01', confidence_threshold: float = 0.8, nms_threshold: float = 0.8)[source]

Bases: BaseModel

Enrichment configuration documentation. Make sure you fill this out fully: if an option is not included here, the configuration will not work.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
confidence_threshold: float
model: str
nms_threshold: float
redshred.enrichments.iris.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.page_images module

class redshred.enrichments.page_images.BackendOptions(*, cropbox: bool = True)[source]

Bases: BaseModel

cropbox: bool
class redshred.enrichments.page_images.PageImagesPerspective(*, name: Literal['page_images'] = 'page_images', perspective: str, segments: SegmentQuery = None, description: str = '', config: PageImagesPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Generate preview images for all pages.

config: PageImagesPerspectiveConfig
name: Literal['page_images']
class redshred.enrichments.page_images.PageImagesPerspectiveBackend(value)[source]

Bases: str, Enum

PDF image generator backend to use

pdfium = 'pdfium'
pdftoppm = 'pdftoppm'
class redshred.enrichments.page_images.PageImagesPerspectiveConfig(*, dpi: int = 150, backend: PageImagesPerspectiveBackend = 'pdftoppm', antialiasing: bool = True, backend_options: dict = None, show_annotations: bool = True, pages: list[str | int] | None = None)[source]

Bases: BaseModel

Generate preview images for all pages.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
antialiasing: bool
backend: PageImagesPerspectiveBackend
backend_options: dict
dpi: int
pages: list[str | int] | None
show_annotations: bool
redshred.enrichments.page_images.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.pdftotext module

class redshred.enrichments.pdftotext.PdftotextPerspective(*, name: Literal['pdftotext'] = 'pdftotext', perspective: str, segments: SegmentQuery = None, description: str = '', config: PdftotextPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Extract text from the document using pdftotext.

config: PdftotextPerspectiveConfig
name: Literal['pdftotext']
class redshred.enrichments.pdftotext.PdftotextPerspectiveConfig(*, layout: bool = True)[source]

Bases: BaseModel

Extract text from the document using pdftotext.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
layout: bool
redshred.enrichments.pdftotext.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.preprocess module

class redshred.enrichments.preprocess.LemmatizingStep(*, operation: str = 'lemmatize', comment: str = '', method: LemmatizingStepMethod = 'wordnet')[source]

Bases: BaseModel

A single lemmatizing step in the preprocessing pipeline

comment: str
method: LemmatizingStepMethod
operation: str
class redshred.enrichments.preprocess.LemmatizingStepMethod(value)[source]

Bases: str, Enum

An enumeration.

wordnet = 'wordnet'
class redshred.enrichments.preprocess.PreprocessPerspective(*, name: Literal['preprocess'] = 'preprocess', perspective: str, segments: SegmentQuery = None, description: str = '', config: PreprocessPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Configuration for the preprocess enrichment

config: PreprocessPerspectiveConfig
name: Literal['preprocess']
class redshred.enrichments.preprocess.PreprocessPerspectiveConfig(*, steps: List[StringMethodStep | RegexStep | StopwordStep | StemmingStep | LemmatizingStep])[source]

Bases: BaseModel

Configuration for the preprocess enrichment

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
steps: List[StringMethodStep | RegexStep | StopwordStep | StemmingStep | LemmatizingStep]
class redshred.enrichments.preprocess.RegexStep(*, operation: str = 'preprocessing-pipeline_replace', comment: str = '', pattern: str, replacement: str, flags: int = 0)[source]

Bases: BaseModel

A single step in the preprocessing pipeline

comment: str
flags: int
operation: str
pattern: str
replacement: str
class redshred.enrichments.preprocess.StemmingStep(*, operation: str = 'stem', comment: str = '', method: StemmingStepMethod = 'porter')[source]

Bases: BaseModel

A single stemming step in the preprocessing pipeline

comment: str
method: StemmingStepMethod
operation: str
class redshred.enrichments.preprocess.StemmingStepMethod(value)[source]

Bases: str, Enum

An enumeration.

porter = 'porter'
class redshred.enrichments.preprocess.StopwordStep(*, operation: str = 'stopword', comment: str = '', stopwords: StopwordStepStopwords = 'nltk', case_sensitive: bool = False, normalize_whitespace: bool = True)[source]

Bases: BaseModel

A single step in the preprocessing pipeline

case_sensitive: bool
comment: str
normalize_whitespace: bool
operation: str
stopwords: StopwordStepStopwords
class redshred.enrichments.preprocess.StopwordStepStopwords(value)[source]

Bases: str, Enum

An enumeration.

nltk = 'nltk'
class redshred.enrichments.preprocess.StringMethodStep(*, operation: str = 'string_method', comment: str = '', method: StringMethodStepMethod)[source]

Bases: BaseModel

A single step in the preprocessing pipeline

comment: str
method: StringMethodStepMethod
operation: str
class redshred.enrichments.preprocess.StringMethodStepMethod(value)[source]

Bases: str, Enum

An enumeration.

capitalize = 'capitalize'
lower = 'lower'
title = 'title'
upper = 'upper'
redshred.enrichments.preprocess.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.regex module

class redshred.enrichments.regex.RegexPattern(*, label: str, flags: str = '', pattern: str)[source]

Bases: BaseModel

Each Regex Pattern defined here will be queried against the input text(s). This allows you to define multiple patterns that you want to run across a single group of inputs in a single batch.

flags: str
label: str
pattern: str
class redshred.enrichments.regex.RegexPerspective(*, name: Literal['regex'] = 'regex', perspective: str, segments: SegmentQuery = None, description: str = '', config: RegexPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Configuration settings for the RegEx Enrichment

config: RegexPerspectiveConfig
name: Literal['regex']
class redshred.enrichments.regex.RegexPerspectiveConfig(*, search_patterns: List[RegexPattern])[source]

Bases: BaseModel

Configuration settings for the RegEx Enrichment

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
search_patterns: List[RegexPattern]
redshred.enrichments.regex.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.sentences module

class redshred.enrichments.sentences.SentencesPerspective(*, name: Literal['sentences'] = 'sentences', perspective: str, segments: SegmentQuery = None, description: str = '', config: SentencesPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Split text into sentences.

config: SentencesPerspectiveConfig
name: Literal['sentences']
class redshred.enrichments.sentences.SentencesPerspectiveConfig(*, method: SentencesPerspectiveMethod = 'nltk')[source]

Bases: BaseModel

Split text into sentences.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
method: SentencesPerspectiveMethod
class redshred.enrichments.sentences.SentencesPerspectiveMethod(value)[source]

Bases: str, Enum

An enumeration.

nltk = 'nltk'
pysbd = 'pysbd'
spacy = 'spacy'
redshred.enrichments.sentences.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.spacy module

class redshred.enrichments.spacy.SpacyPerspective(*, name: Literal['spacy'] = 'spacy', perspective: str, segments: SegmentQuery = None, description: str = '', config: SpacyPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Configuration for the spaCy enrichment

config: SpacyPerspectiveConfig
name: Literal['spacy']
class redshred.enrichments.spacy.SpacyPerspectiveConfig(*, text_field: str = '$.text', model: str = 'en_core_web_sm', task: SpacyPerspectiveTask = 'noun_chunks')[source]

Bases: BaseModel

Configuration for the spaCy enrichment

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
model: str
task: SpacyPerspectiveTask
text_field: str
class redshred.enrichments.spacy.SpacyPerspectiveTask(value)[source]

Bases: str, Enum

An enumeration.

dep = 'dep'
morphology = 'morphology'
ner = 'ner'
noun_chunks = 'noun_chunks'
pos = 'pos'
spans = 'spans'
vectors = 'vectors'
redshred.enrichments.spacy.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.tfidf module

class redshred.enrichments.tfidf.TFIDFPerspective(*, name: Literal['tfidf'] = 'tfidf', perspective: str, segments: SegmentQuery = None, description: str = '', config: TFIDFPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

Perspective configuration for the tfidf enrichment.

config: TFIDFPerspectiveConfig
name: Literal['tfidf']
class redshred.enrichments.tfidf.TFIDFPerspectiveConfig(*, stop_words: None = None, norm: TFIDFPerspectiveNorm = 'l2', ngram_range: List[int] = None, lowercase: bool = True, max_df: float = 1.0, min_df: None = None, max_features: int = 5000)[source]

Bases: BaseModel

Configuration options for the tfidf enrichment.

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
lowercase: bool
max_df: float
max_features: int
min_df: None
ngram_range: List[int]
norm: TFIDFPerspectiveNorm
stop_words: None
class redshred.enrichments.tfidf.TFIDFPerspectiveNorm(value)[source]

Bases: str, Enum

See https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html for documentation of the available norms.

l1 = 'l1'
l2 = 'l2'
redshred.enrichments.tfidf.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

redshred.enrichments.typography module

class redshred.enrichments.typography.TypographyPerspective(*, name: Literal['typography'] = 'typography', perspective: str, segments: SegmentQuery = None, description: str = '', config: TypographyPerspectiveConfig = None, debug: bool = False, **extra_data: Any)[source]

Bases: PerspectiveConfiguration

config: TypographyPerspectiveConfig
name: Literal['typography']
class redshred.enrichments.typography.TypographyPerspectiveConfig(*, options: Dict = None, segmenter: TypographyPerspectiveSegmenter = 'pdftotext')[source]

Bases: BaseModel

class Config[source]

Bases: object

arbitrary_types_allowed = True
use_enum_values = True
options: Dict
segmenter: TypographyPerspectiveSegmenter
class redshred.enrichments.typography.TypographyPerspectiveSegmenter(value)[source]

Bases: str, Enum

An enumeration.

pdfminer = 'pdfminer'
pdftotext = 'pdftotext'
tesseract = 'tesseract'
redshred.enrichments.typography.object_setattr(self, name, value, /)

Implement setattr(self, name, value).

Module contents