redshred.models package

Submodules

redshred.models.api module

class redshred.models.api.APIObjectIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: Iterator

count: int | None

first(): Return the next item from the iterator. When exhausted, raise StopIteration

iter_dict(**dict_args)[source]

path: str | None

query: str

to_list()[source]

class redshred.models.api.ApiObject(*, self_link: str)[source]

Bases: SerializableModel

create(client) → TApiObject[source]

dashboard(query=None) → str[source]: returns a url to the RedShred dashboard representation of the object (if supported)

delete() → TApiObject[source]: Delete the remote object

classmethod load(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient', url: str = None, collection: str | Collection = None, object_id: str = None, **key_word_filters) → TApiObject[source]

q(query: str, search_type: Literal['documents', 'pages', 'perspectives', 'segments'] = 'segments', **url_params)[source]

read() → TApiObject[source]: Update the object with the remote state

self_link: str

update(**kwargs) → TApiObject[source]: Update the remote object with the current local state.

class redshred.models.api.Collection(*, self_link: str = None, id: str = None, config: CollectionConfiguration | None = None, created_at: datetime = None, created_by: str = None, description: str | None = None, documents_link: str = None, marked_for_delete: bool | None = False, metadata: None = None, name: str = None, owner: str = None, perspectives_link: str = None, segments_link: str = None, slug: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, client: Any = None)[source]

Bases: ApiObject

client: Any

config: CollectionConfiguration | None

create(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient') → TApiObject[source]

Create the local object on the remote server

Args: client (RedShredClient, RedShredAPI): the client to use to create the object

created_at: datetime.datetime

created_by: str

delete()[source]: Delete the remote object

description: str | None

document(document_id) → Document[source]

documents(q: str | None = None, fields: List[str] | None = None, **url_params) → DocumentIterator[Document][source]

documents_link: str

id: str

classmethod load(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient', slug: str = None, url: str = None) → Collection[source]

marked_for_delete: bool | None

metadata: None

name: str

owner: str

perspective(perspective_id) → Perspective[source]

perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) → PerspectiveIterator[Perspective][source]

perspectives_link: str

segment(segment_id) → Segment[source]

segments(q: str | None = None, fields: List[str] | None = None, **url_params) → SegmentIterator[Segment][source]

segments_link: str

self_link: str

slug: str

updated_at: datetime.datetime

updated_by: str

upload_csv(file, content_columns: List[str], delimiter=',', rename: str | None = None, **user_data)[source]

Args:

file: source file to upload

content_columns: Used for multi-document upload via CSV. A list which specifies the column(s) that will be used for the document body.

delimiter: delimiter for csv, defaults to ","

rename: a new name if desired

**user_data: any additional user data

Returns:

None

upload_file(file: Path | BufferedReader, rename: str | None = None, save_origin: bool | None = False, **user_data) → Document[source]

Convenience method to upload a filelike into RedShred.

Args:
    collection_link (str): Target collection to upload file to
    file (str, filelike): Either a filename, url of file, or open() filelike object
    rename (str, optional): File name override. Defaults to existing filename
    save_origin (bool, optional): Save the path to the file on disk. Defaults to False
    user_data (dict): arbitrary dictionary to store with document on server
Raises:
    ValueError: Name argument missing for URL upload
Returns:
    dict: Returned payload from API server

upload_text(text: str, name: str, **user_data) → Document[source]

Convenience method to upload raw text into RedShred.

Given text and a file name, upload that text into RedShred.

Args:
    text (str): Text to upload.
    name (str): File name to save text as.
    user_data (dict): arbitrary dictionary to store with document on server
Returns:
    dict: Returned payload from API server

upload_url(url: str, rename: str | None = None, save_origin: bool | None = True, **user_data) → Document[source]

Convenience method to upload a URL into RedShred.

Given a collection name and a url, upload that file into RedShred.

Args:
    collection_link (str): Target collection to upload file to
    url (str, filelike): Url of file to upload.
    rename (str, optional): File name override. Defaults to existing filename
    save_origin (bool, optional): Save the url to the file. Defaults to True
    user_data (dict): arbitrary dictionary to store with document on server
Raises:
    ValueError: Name argument missing for URL upload
Returns:
    dict: Returned payload from API server

user_data: dict | None

class redshred.models.api.CollectionIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None

path: str | None

query: str

class redshred.models.api.Document(*, self_link: str = None, id: str = None, collection_link: str = None, collection_slug: str = None, config: CollectionConfiguration | None = None, content_hash: str = None, created_at: datetime = None, created_by: str = None, csv_metadata: dict | None = None, description: str | None = None, document_segment_link: str = None, errors: dict | str | None = None, file_link: str = None, file_size: int = None, index: int = None, metadata: dict | None = None, n_pages: int = None, name: str = None, original_name: str = None, pages_link: str = None, pdf_link: str = None, perspectives_link: str = None, read_state: str = None, read_state_updated_at: datetime = None, region: GeoJSON = None, segments_link: str = None, slug: str = None, source: str = None, summary: str = None, text: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, uniqueness_id: str | None = None)[source]

Bases: ApiObject

collection() → Collection[source]

collection_link: str

collection_slug: str

config: CollectionConfiguration | None

content_hash: str

create(*args, **kwargs)[source]

created_at: datetime.datetime

created_by: str

csv_metadata: dict | None

description: str | None

document_segment_link: str

download(path: str | 'pathlib.Path') → int[source]

Download the original_file uploaded to RedShred to the specified path, returning the total bytes written

Args: path: a path to somewhere on the local filesystem

Returns: number of bytes written

download_bytes() → bytes[source]

Download the original_file uploaded to RedShred and return its contents as bytes

Returns: document as bytes

errors: dict | str | None

file_link: str

file_size: int

id: str

index: int

metadata: dict | None

n_pages: int

name: str

original_name: str

page(index) → Page[source]

pages(q: str | None = None, fields: List[str] | None = None, **url_params) → PageIterator[Page][source]

pages_link: str

pdf_link: str

perspective(perspective_id) → Perspective[source]

perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) → PerspectiveIterator[Perspective][source]

perspectives_link: str

read_state: str

read_state_updated_at: datetime.datetime

region: GeoJSON

reread_document(force=False)[source]

segment(segment_id) → Perspective[source]

segments(q: str | None = None, fields: List[str] | None = None, **url_params) → SegmentIterator[Segment][source]

segments_link: str

self_link: str

slug: str

source: str

summary: str

text: str

uniqueness_id: str | None

updated_at: datetime.datetime

updated_by: str

user_data: dict | None

wait_until_read(wait_time_seconds: int = 5)[source]

warnings: dict | None

class redshred.models.api.DocumentIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None

path: str | None

query: str

class redshred.models.api.Page(*, self_link: str = None, collection_link: str = None, collection_slug: str = None, content_hash: str = None, created_at: datetime = None, created_by: str = None, document_index: int = None, document_name: str = None, dpi: int = None, height: float = None, id: str = None, index: int = None, metadata: dict | None = None, name: str = None, page_segment_link: str = None, perspectives_link: str = None, region: GeoJSON = None, segments_link: str = None, summary: str = None, text: str = None, tokens_file_link: str = None, units: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, width: float = None)[source]

Bases: ApiObject

collection()[source]

collection_link: str

collection_slug: str

content_hash: str

created_at: datetime.datetime

created_by: str

document()[source]

document_index: int

document_name: str

dpi: int

height: float

id: str

index: int

metadata: dict | None

name: str

next()[source]

page_segment_link: str

perspective(perspective_id) → Perspective[source]

perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) → PerspectiveIterator[Perspective][source]

perspectives_link: str

previous()[source]

region: GeoJSON

segment(segment_id) → Segment[source]

segments(q: str | None = None, fields: List[str] | None = None, **url_params) → SegmentIterator[Segment][source]

segments_link: str

self_link: str

summary: str

text: str

tokens()[source]

tokens_file_link: str

units: str

updated_at: datetime.datetime

updated_by: str

user_data: dict | None

width: float

class redshred.models.api.PageIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None

path: str | None

query: str

class redshred.models.api.Perspective(*, self_link: str = None, name: str = None, enrichment_name: str = None, collection_link: str = None, collection_slug: str = None, created_at: datetime = None, created_by: str = None, document_link: str = None, description: str | None = None, document_name: str = None, enrichment_config: dict = None, errors: dict | None = None, id: str = None, metadata: dict | None = None, segment_types: list = None, segments_link: str = None, slug: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, cache_id: str | None = None)[source]

Bases: ApiObject

bulk_create_segments(segments: List[dict | Segment], batch_size=128)[source]

cache_id: str | None

collection()[source]

collection_link: str

collection_slug: str

create(collection: str | Collection | None = None, document: Document | None = None, client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient' | None = None)[source]: Create the local object on the remote server

created_at: datetime.datetime

created_by: str

description: str | None

document()[source]

document_link: str

document_name: str

enrichment_config: dict

enrichment_name: str

errors: dict | None

id: str

metadata: dict | None

name: str

segment(segment_id) → Segment[source]

segment_types: list

segments(q: str | None = None, fields: List[str] | None = None, **url_params) → SegmentIterator[Segment][source]

segments_link: str

self_link: str

slug: str

updated_at: datetime.datetime

updated_by: str

user_data: dict | None

warnings: dict | None

class redshred.models.api.PerspectiveIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None

path: str | None

query: str

class redshred.models.api.RedShredUser(*, active: bool, email: str, first_name: str, joined: datetime, last_login: datetime, last_name: str, staff: bool, super: bool, username: str, token: str)[source]

Bases: SerializableModel

active: bool

email: str

first_name: str

joined: datetime

last_login: datetime

last_name: str

staff: bool

super: bool

token: str

username: str

class redshred.models.api.Segment(*, self_link: str = None, segment_type: str = None, regions: GeoJSON = None, bounding_box: BoundingBox | None = None, collection_link: str = None, collection_slug: str = None, created_at: datetime = None, created_by: str = None, document_link: str = None, document_name: str = None, enrichment_data: dict | None = None, enrichment_name: str = None, errors: dict | None = None, id: str = None, labels: list = None, metadata: dict | None = None, max_x: float | None = None, max_y: float | None = None, min_x: float | None = None, min_y: float | None = None, perspective_link: str = None, summary: str = None, text: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, cache_id: str | None = None)[source]

Bases: ApiObject

between(segment: Segment, strict: bool = False) → BoundingBox[source]

Provides a helper function to generate the bounding box between two segments.

Args:
    segment (Segment): A segment to define the space
    strict (bool, optional): If True, returned bounding box will be the area exactly between the two segments. If False, the bounding box returned will be the entire page width between two segments. Defaults to False.
Returns:
    list: Bounding box of the area between two segments.

bounding_box: BoundingBox | None

cache_id: str | None

collection()[source]

collection_link: str

collection_slug: str

create(perspective: Perspective | None = None)[source]: Create the local object on the remote server

created_at: datetime.datetime

created_by: str

document()[source]

document_link: str

document_name: str

enrichment_data: dict | None

enrichment_name: str

errors: dict | None

get_segment_image(path_to_save_folder: str | None = None, return_bytes=False, inline=False, **url_params) → bytes | str[source]

get_segments_from_perspective(perspective_name: str, **params)[source]

get_text(**url_params)[source]

id: str

labels: list

max_x: float | None

max_y: float | None

metadata: dict | None

min_x: float | None

min_y: float | None

perspective()[source]

perspective_link: str

q(query: str, search_type: Literal['documents', 'pages', 'perspectives', 'segments'] = 'segments', **url_params)[source]

regions: GeoJSON

segment_type: str

self_link: str

summary: str

text: str

updated_at: datetime.datetime

updated_by: str

user_data: dict | None

warnings: dict | None

class redshred.models.api.SegmentIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None

path: str | None

query: str

class redshred.models.api.SerializableModel[source]

Bases: BaseModel

class Config[source]

Bases: object

arbitrary_types_allowed = True

extra = 'forbid'

json_encoders = {<class 'datetime.datetime'>: <function SerializableModel.Config.<lambda>>, <class 'redshred.spatial.GeoJSON'>: <class 'dict'>, <class 'redshred.spatial.BoundingBox'>: <class 'dict'>}

underscore_attrs_are_private = True

use_enum_values = True

classmethod dict_to_geojson(v)[source]

classmethod list_to_bbox(v)[source]

classmethod lists_to_bboxes(v)[source]

yaml(include=None, exclude=None, indent=None)[source]

class redshred.models.api.Token(*, index: int, text: str, text_with_ws: str | None = None, bboxes: List[BoundingBox], regions: GeoJSON, metadata: dict | None = None, rotation: int = 0)[source]

Bases: SerializableModel

class Config[source]

Bases: Config

extra = 'ignore'

bboxes: List[BoundingBox]

index: int

metadata: dict | None

regions: GeoJSON

rotation: int

text: str

text_with_ws: str | None

redshred.models.api.get_type(name)[source]

redshred.models.configuration module

class redshred.models.configuration.ChoiceSegmenter(value)[source]

Bases: str, Enum

An enumeration.

pdfminer = 'pdfminer'

pdftotext = 'pdftotext'

tesseract = 'tesseract'

class redshred.models.configuration.ChoiceSentenceTokenizer(value)[source]

Bases: str, Enum

An enumeration.

nltk = 'nltk'

spacy = 'spacy'

class redshred.models.configuration.CollectionConfiguration[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

allow_anonymous_downloads: bool

dict(*, include: AbstractSetIntStr | MappingIntStrAny | None = None, exclude: AbstractSetIntStr | MappingIntStrAny | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = False, exclude_defaults: bool = False, exclude_none: bool = False) → DictStrAny: Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

document_uniqueness: DocumentUniqueness

enrichments: List[DefinedAcronymsPerspective | ExternalAPIPerspective | GrouperPerspective | HuggingfacePerspective | IrisPerspective | PageImagesPerspective | PdftotextPerspective | PreprocessPerspective | RegexPerspective | SentencesPerspective | SpacyPerspective | TFIDFPerspective | TypographyPerspective | PerspectiveConfiguration]

classmethod from_dict(config: Dict[str, Any])[source]

json(*, include: AbstractSetIntStr | MappingIntStrAny | None = None, exclude: AbstractSetIntStr | MappingIntStrAny | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = False, exclude_defaults: bool = False, exclude_none: bool = False, encoder: Callable[[Any], Any] | None = None, models_as_dict: bool = True, **dumps_kwargs: Any) → unicode

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

notifications: List[NotificationConfiguration]

tokenizer: List[Tokenizers | str] | Tokenizers | str

validate_remote_schema(client: redshred.api.client.RedShredClient)[source]

yaml(*, include: Set[str] | None = None, exclude: Set[str] | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = True, exclude_defaults: bool = False, exclude_none: bool = False, encoder: Callable[[Any], Any] | None = None, models_as_dict: bool = True, **dumps_kwargs: Any)[source]

Generate a YAML representation of the model from the JSON representation, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

class redshred.models.configuration.CustomSegmenter(*, name: ChoiceSegmenter, options: Dict[str, Any] = None)[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

name: ChoiceSegmenter

options: Dict[str, Any]

class redshred.models.configuration.DocumentUniqueness(value)[source]

Bases: str, Enum

An enumeration.

always = 'always'

contents = 'contents'

filename = 'filename'

class redshred.models.configuration.NotificationConfiguration(*, label: ConstrainedStrValue, recipients: ConstrainedListValue[str], query: str, condition: str = 'length(@) > `0`')[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

condition: str

label: str

query: str

recipients: List[str]

class redshred.models.configuration.PerspectiveConfiguration(*, name: str, perspective: str, segments: PerspectiveSegmentQuery | Dict = None, description: str = '', config: Dict[str, Any] = None, debug: bool = False)[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

config: Dict[str, Any]

debug: bool

description: str

name: str

perspective: str

segments: PerspectiveSegmentQuery | Dict

class redshred.models.configuration.PerspectiveSegmentQuery(*, queries: List[str] | Dict[str, Any] = None, prerequisites: List[str] = None)[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

prerequisites: List[str]

queries: List[str] | Dict[str, Any]

class redshred.models.configuration.Tokenizers(value)[source]

Bases: str, Enum

An enumeration.

pdfminer = 'pdfminer'

pdftotext = 'pdftotext'

tesseract = 'tesseract'

tet = 'tet'

class redshred.models.configuration.TypographyConfiguration(*, debug: bool = False, segmenter: ChoiceSegmenter | CustomSegmenter = 'pdfminer', sentence_tokenizer: ChoiceSentenceTokenizer | None = None)[source]

Bases: BaseModel

Config: alias of _DefaultPydanticConfig

debug: bool

segmenter: ChoiceSegmenter | CustomSegmenter

sentence_tokenizer: ChoiceSentenceTokenizer | None

redshred.models package

Submodules

redshred.models.api module

redshred.models.configuration module

redshred.models.enrichments module

Module contents