redshred.models package

Submodules

redshred.models.api module

class redshred.models.api.APIObjectIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: Iterator

count: int | None
first()

Return the next item from the iterator. When exhausted, raise StopIteration

iter_dict(**dict_args)[source]
path: str | None
query: str
to_list()[source]
class redshred.models.api.ApiObject(*, self_link: str)[source]

Bases: SerializableModel

create(client) TApiObject[source]
dashboard(query=None) str[source]

Returns a URL to the RedShred dashboard representation of the object (if supported).

delete() TApiObject[source]

Delete the remote object

classmethod load(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient', url: str = None, collection: str | Collection = None, object_id: str = None, **key_word_filters) TApiObject[source]
q(query: str, search_type: Literal['documents', 'pages', 'perspectives', 'segments'] = 'segments', **url_params)[source]
read() TApiObject[source]

Update the object with the remote state

update(**kwargs) TApiObject[source]

Update the remote object with the current local state.

class redshred.models.api.Collection(*, self_link: str = None, id: str = None, config: CollectionConfiguration | None = None, created_at: datetime = None, created_by: str = None, description: str | None = None, documents_link: str = None, marked_for_delete: bool | None = False, metadata: None = None, name: str = None, owner: str = None, perspectives_link: str = None, segments_link: str = None, slug: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, client: Any = None)[source]

Bases: ApiObject

client: Any
config: CollectionConfiguration | None
create(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient') TApiObject[source]

Create the local object on the remote server

Args:

client (RedShredClient, RedShredAPI): the client to use to create the object

created_at: datetime.datetime
created_by: str
delete()[source]

Delete the remote object

description: str | None
document(document_id) Document[source]
documents(q: str | None = None, fields: List[str] | None = None, **url_params) DocumentIterator[Document][source]
id: str
classmethod load(client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient', slug: str = None, url: str = None) Collection[source]
marked_for_delete: bool | None
metadata: None
name: str
owner: str
perspective(perspective_id) Perspective[source]
perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) PerspectiveIterator[Perspective][source]
segment(segment_id) Segment[source]
segments(q: str | None = None, fields: List[str] | None = None, **url_params) SegmentIterator[Segment][source]
slug: str
updated_at: datetime.datetime
updated_by: str
upload_csv(file, content_columns: List[str], delimiter=',', rename: str | None = None, **user_data)[source]
Args:

file: source file to upload

content_columns: Used for multi-document upload via CSV. A list which specifies the column(s) that will be used for the document body.

delimiter: delimiter for csv, defaults to “,”

rename: a new name if desired

**user_data: any additional user data

Returns:

None

upload_file(file: Path | BufferedReader, rename: str | None = None, save_origin: bool | None = False, **user_data) Document[source]

Convenience method to upload a filelike into RedShred.

Args:

collection_link (str): Target collection to upload file to

file (str, filelike): Either a filename, url of file, or open() filelike object

rename (str, optional): File name override. Defaults to existing filename

save_origin (bool, optional): Save the path to the file on disk. Defaults to False

user_data (dict): arbitrary dictionary to store with document on server

Raises:

ValueError: Name argument missing for URL upload

Returns:

dict: Returned payload from API server

upload_text(text: str, name: str, **user_data) Document[source]

Convenience method to upload raw text into RedShred.

Given text and a file name, upload that text into RedShred.

Args:

text (str): Text to upload.

name (str): File name to save text as.

user_data (dict): arbitrary dictionary to store with document on server

Returns:

dict: Returned payload from API server

upload_url(url: str, rename: str | None = None, save_origin: bool | None = True, **user_data) Document[source]

Convenience method to upload a URL into RedShred.

Given a collection name and a url, upload that file into RedShred.

Args:

collection_link (str): Target collection to upload file to

url (str, filelike): Url of file to upload.

rename (str, optional): File name override. Defaults to existing filename

save_origin (bool, optional): Save the url to the file. Defaults to True

user_data (dict): arbitrary dictionary to store with document on server

Raises:

ValueError: Name argument missing for URL upload

Returns:

dict: Returned payload from API server

user_data: dict | None
class redshred.models.api.CollectionIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None
path: str | None
query: str
class redshred.models.api.Document(*, self_link: str = None, id: str = None, collection_link: str = None, collection_slug: str = None, config: CollectionConfiguration | None = None, content_hash: str = None, created_at: datetime = None, created_by: str = None, csv_metadata: dict | None = None, description: str | None = None, document_segment_link: str = None, errors: dict | str | None = None, file_link: str = None, file_size: int = None, index: int = None, metadata: dict | None = None, n_pages: int = None, name: str = None, original_name: str = None, pages_link: str = None, pdf_link: str = None, perspectives_link: str = None, read_state: str = None, read_state_updated_at: datetime = None, region: GeoJSON = None, segments_link: str = None, slug: str = None, source: str = None, summary: str = None, text: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, uniqueness_id: str | None = None)[source]

Bases: ApiObject

collection() Collection[source]
collection_slug: str
config: CollectionConfiguration | None
content_hash: str
create(*args, **kwargs)[source]
created_at: datetime.datetime
created_by: str
csv_metadata: dict | None
description: str | None
download(path: str | 'pathlib.Path') int[source]

Download the original_file uploaded to RedShred to the specified path, returning the total bytes written

Args:

path: a path to somewhere on the local filesystem

Returns: number of bytes written

download_bytes() bytes[source]

Download the original_file uploaded to RedShred, returning its contents as bytes

Returns: document as bytes

errors: dict | str | None
file_size: int
id: str
index: int
metadata: dict | None
n_pages: int
name: str
original_name: str
page(index) Page[source]
pages(q: str | None = None, fields: List[str] | None = None, **url_params) PageIterator[Page][source]
perspective(perspective_id) Perspective[source]
perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) PerspectiveIterator[Perspective][source]
read_state: str
read_state_updated_at: datetime.datetime
region: GeoJSON
reread_document(force=False)[source]
segment(segment_id) Perspective[source]
segments(q: str | None = None, fields: List[str] | None = None, **url_params) SegmentIterator[Segment][source]
slug: str
source: str
summary: str
text: str
uniqueness_id: str | None
updated_at: datetime.datetime
updated_by: str
user_data: dict | None
wait_until_read(wait_time_seconds: int = 5)[source]
warnings: dict | None
class redshred.models.api.DocumentIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None
path: str | None
query: str
class redshred.models.api.Page(*, self_link: str = None, collection_link: str = None, collection_slug: str = None, content_hash: str = None, created_at: datetime = None, created_by: str = None, document_index: int = None, document_name: str = None, dpi: int = None, height: float = None, id: str = None, index: int = None, metadata: dict | None = None, name: str = None, page_segment_link: str = None, perspectives_link: str = None, region: GeoJSON = None, segments_link: str = None, summary: str = None, text: str = None, tokens_file_link: str = None, units: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, width: float = None)[source]

Bases: ApiObject

collection()[source]
collection_slug: str
content_hash: str
created_at: datetime.datetime
created_by: str
document()[source]
document_index: int
document_name: str
dpi: int
height: float
id: str
index: int
metadata: dict | None
name: str
next()[source]
perspective(perspective_id) Perspective[source]
perspectives(q: str | None = None, fields: List[str] | None = None, **url_params) PerspectiveIterator[Perspective][source]
previous()[source]
region: GeoJSON
segment(segment_id) Segment[source]
segments(q: str | None = None, fields: List[str] | None = None, **url_params) SegmentIterator[Segment][source]
summary: str
text: str
tokens()[source]
units: str
updated_at: datetime.datetime
updated_by: str
user_data: dict | None
width: float
class redshred.models.api.PageIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None
path: str | None
query: str
class redshred.models.api.Perspective(*, self_link: str = None, name: str = None, enrichment_name: str = None, collection_link: str = None, collection_slug: str = None, created_at: datetime = None, created_by: str = None, document_link: str = None, description: str | None = None, document_name: str = None, enrichment_config: dict = None, errors: dict | None = None, id: str = None, metadata: dict | None = None, segment_types: list = None, segments_link: str = None, slug: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, cache_id: str | None = None)[source]

Bases: ApiObject

bulk_create_segments(segments: List[dict | Segment], batch_size=128)[source]
cache_id: str | None
collection()[source]
collection_slug: str
create(collection: str | Collection | None = None, document: Document | None = None, client: 'redshred.api.http.RedShredAPI' | 'redshred.api.client.RedShredClient' | None = None)[source]

Create the local object on the remote server

created_at: datetime.datetime
created_by: str
description: str | None
document()[source]
document_name: str
enrichment_config: dict
enrichment_name: str
errors: dict | None
id: str
metadata: dict | None
name: str
segment(segment_id) Segment[source]
segment_types: list
segments(q: str | None = None, fields: List[str] | None = None, **url_params) SegmentIterator[Segment][source]
slug: str
updated_at: datetime.datetime
updated_by: str
user_data: dict | None
warnings: dict | None
class redshred.models.api.PerspectiveIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None
path: str | None
query: str
class redshred.models.api.RedShredUser(*, active: bool, email: str, first_name: str, joined: datetime, last_login: datetime, last_name: str, staff: bool, super: bool, username: str, token: str)[source]

Bases: SerializableModel

active: bool
email: str
first_name: str
joined: datetime
last_login: datetime
last_name: str
staff: bool
super: bool
token: str
username: str
class redshred.models.api.Segment(*, self_link: str = None, segment_type: str = None, regions: GeoJSON = None, bounding_box: BoundingBox | None = None, collection_link: str = None, collection_slug: str = None, created_at: datetime = None, created_by: str = None, document_link: str = None, document_name: str = None, enrichment_data: dict | None = None, enrichment_name: str = None, errors: dict | None = None, id: str = None, labels: list = None, metadata: dict | None = None, max_x: float | None = None, max_y: float | None = None, min_x: float | None = None, min_y: float | None = None, perspective_link: str = None, summary: str = None, text: str = None, updated_at: datetime = None, updated_by: str = None, user_data: dict | None = None, warnings: dict | None = None, cache_id: str | None = None)[source]

Bases: ApiObject

between(segment: Segment, strict: bool = False) BoundingBox[source]

Provides a helper function to generate the bounding box between two segments.

Args:

segment (Segment): A segment to define the space

strict (bool, optional): If True, returned bounding box will be area exactly between the two segments. If False, the bounding box returned will be the entire page width between two segments. Defaults to False.

Returns:

list: Bounding box of the area between two segments.

bounding_box: BoundingBox | None
cache_id: str | None
collection()[source]
collection_slug: str
create(perspective: Perspective | None = None)[source]

Create the local object on the remote server

created_at: datetime.datetime
created_by: str
document()[source]
document_name: str
enrichment_data: dict | None
enrichment_name: str
errors: dict | None
get_segment_image(path_to_save_folder: str | None = None, return_bytes=False, inline=False, **url_params) bytes | str[source]
get_segments_from_perspective(perspective_name: str, **params)[source]
get_text(**url_params)[source]
id: str
labels: list
max_x: float | None
max_y: float | None
metadata: dict | None
min_x: float | None
min_y: float | None
perspective()[source]
q(query: str, search_type: Literal['documents', 'pages', 'perspectives', 'segments'] = 'segments', **url_params)[source]
regions: GeoJSON
segment_type: str
summary: str
text: str
updated_at: datetime.datetime
updated_by: str
user_data: dict | None
warnings: dict | None
class redshred.models.api.SegmentIterator(data: Iterable[ApiObject] = None, client: 'redshred.api.client.RedShredClient' | 'redshred.api.http.RedShredAPI' = None, path: str = None, q: str = None, fields: List[str] | Set[str] | str | None = None, **client_params)[source]

Bases: APIObjectIterator

count: int | None
path: str | None
query: str
class redshred.models.api.SerializableModel[source]

Bases: BaseModel

class Config[source]

Bases: object

arbitrary_types_allowed = True
extra = 'forbid'
json_encoders = {<class 'datetime.datetime'>: <function SerializableModel.Config.<lambda>>, <class 'redshred.spatial.GeoJSON'>: <class 'dict'>, <class 'redshred.spatial.BoundingBox'>: <class 'dict'>}
underscore_attrs_are_private = True
use_enum_values = True
classmethod dict_to_geojson(v)[source]
classmethod list_to_bbox(v)[source]
classmethod lists_to_bboxes(v)[source]
yaml(include=None, exclude=None, indent=None)[source]
class redshred.models.api.Token(*, index: int, text: str, text_with_ws: str | None = None, bboxes: List[BoundingBox], regions: GeoJSON, metadata: dict | None = None, rotation: int = 0)[source]

Bases: SerializableModel

class Config[source]

Bases: Config

extra = 'ignore'
bboxes: List[BoundingBox]
index: int
metadata: dict | None
regions: GeoJSON
rotation: int
text: str
text_with_ws: str | None
redshred.models.api.get_type(name)[source]

redshred.models.configuration module

class redshred.models.configuration.ChoiceSegmenter(value)[source]

Bases: str, Enum

An enumeration.

pdfminer = 'pdfminer'
pdftotext = 'pdftotext'
tesseract = 'tesseract'
class redshred.models.configuration.ChoiceSentenceTokenizer(value)[source]

Bases: str, Enum

An enumeration.

nltk = 'nltk'
spacy = 'spacy'
class redshred.models.configuration.CollectionConfiguration(*, tokenizer: List[Tokenizers | str] | Tokenizers | str = None, enrichments: List[DefinedAcronymsPerspective | ExternalAPIPerspective | GrouperPerspective | HuggingfacePerspective | IrisPerspective | PageImagesPerspective | PdftotextPerspective | PreprocessPerspective | RegexPerspective | SentencesPerspective | SpacyPerspective | TFIDFPerspective | TypographyPerspective | PerspectiveConfiguration] = None, notifications: List[NotificationConfiguration] = None, document_uniqueness: DocumentUniqueness = 'contents', allow_anonymous_downloads: bool = False)[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

allow_anonymous_downloads: bool
dict(*, include: AbstractSetIntStr | MappingIntStrAny | None = None, exclude: AbstractSetIntStr | MappingIntStrAny | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = False, exclude_defaults: bool = False, exclude_none: bool = False) DictStrAny

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

document_uniqueness: DocumentUniqueness
enrichments: List[DefinedAcronymsPerspective | ExternalAPIPerspective | GrouperPerspective | HuggingfacePerspective | IrisPerspective | PageImagesPerspective | PdftotextPerspective | PreprocessPerspective | RegexPerspective | SentencesPerspective | SpacyPerspective | TFIDFPerspective | TypographyPerspective | PerspectiveConfiguration]
classmethod from_dict(config: Dict[str, Any])[source]
json(*, include: AbstractSetIntStr | MappingIntStrAny | None = None, exclude: AbstractSetIntStr | MappingIntStrAny | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = False, exclude_defaults: bool = False, exclude_none: bool = False, encoder: Callable[[Any], Any] | None = None, models_as_dict: bool = True, **dumps_kwargs: Any) unicode

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

notifications: List[NotificationConfiguration]
tokenizer: List[Tokenizers | str] | Tokenizers | str
validate_remote_schema(client: redshred.api.client.RedShredClient)[source]
yaml(*, include: Set[str] | None = None, exclude: Set[str] | None = None, by_alias: bool = False, skip_defaults: bool | None = None, exclude_unset: bool = True, exclude_defaults: bool = False, exclude_none: bool = False, encoder: Callable[[Any], Any] | None = None, models_as_dict: bool = True, **dumps_kwargs: Any)[source]

Generate a YAML representation of the model from the JSON representation, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

class redshred.models.configuration.CustomSegmenter(*, name: ChoiceSegmenter, options: Dict[str, Any] = None)[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

name: ChoiceSegmenter
options: Dict[str, Any]
class redshred.models.configuration.DocumentUniqueness(value)[source]

Bases: str, Enum

An enumeration.

always = 'always'
contents = 'contents'
filename = 'filename'
class redshred.models.configuration.NotificationConfiguration(*, label: ConstrainedStrValue, recipients: ConstrainedListValue[str], query: str, condition: str = 'length(@) > `0`')[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

condition: str
label: str
query: str
recipients: List[str]
class redshred.models.configuration.PerspectiveConfiguration(*, name: str, perspective: str, segments: PerspectiveSegmentQuery | Dict = None, description: str = '', config: Dict[str, Any] = None, debug: bool = False)[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

config: Dict[str, Any]
debug: bool
description: str
name: str
perspective: str
segments: PerspectiveSegmentQuery | Dict
class redshred.models.configuration.PerspectiveSegmentQuery(*, queries: List[str] | Dict[str, Any] = None, prerequisites: List[str] = None)[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

prerequisites: List[str]
queries: List[str] | Dict[str, Any]
class redshred.models.configuration.Tokenizers(value)[source]

Bases: str, Enum

An enumeration.

pdfminer = 'pdfminer'
pdftotext = 'pdftotext'
tesseract = 'tesseract'
tet = 'tet'
class redshred.models.configuration.TypographyConfiguration(*, debug: bool = False, segmenter: ChoiceSegmenter | CustomSegmenter = 'pdfminer', sentence_tokenizer: ChoiceSentenceTokenizer | None = None)[source]

Bases: BaseModel

Config

alias of _DefaultPydanticConfig

debug: bool
segmenter: ChoiceSegmenter | CustomSegmenter
sentence_tokenizer: ChoiceSentenceTokenizer | None

redshred.models.enrichments module

Module contents