# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    Optional,
    Union,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis._settings import note_deprecation
from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from typing import TypeAlias

    from watchdog.observers.api import BaseObserver

StrPathT: "TypeAlias" = Union[str, PathLike[str]]
SaveDataT: "TypeAlias" = tuple[bytes, bytes]  # key, value
DeleteDataT: "TypeAlias" = tuple[bytes, Optional[bytes]]  # key, value
ListenerEventT: "TypeAlias" = Union[
    tuple[Literal["save"], SaveDataT], tuple[Literal["delete"], DeleteDataT]
]
ListenerT: "TypeAlias" = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
72 """
73 Returns True if the desired path can be used as database path because
74 either the directory exists and can be used, or its root directory can
75 be used and we can make the directory as needed.
76 """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:
        return False


def _db_for_path(
    path: Optional[Union[StrPathT, UniqueIdentifier, Literal[":memory:"]]] = None,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
if "sphinx" in sys.modules:
    try:
        from sphinx.ext.autodoc import _METACLASS_CALL_BLACKLIST

        _METACLASS_CALL_BLACKLIST.append("hypothesis.database._EDMeta.__call__")
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in the |settings.database| setting.

    Using an |ExampleDatabase| allows Hypothesis to
    remember and replay failing examples, and ensure that your tests are never
    flaky. We provide several concrete subclasses, and you can write a custom
    database backed by your preferred way to store a ``dict[bytes, set[bytes]]``.
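
    As an illustrative sketch (the class name here is hypothetical), a custom
    database backed by a plain dictionary might look like:

    .. code-block:: python

        class DictExampleDatabase(ExampleDatabase):
            def __init__(self):
                super().__init__()
                self.data: dict[bytes, set[bytes]] = {}

            def save(self, key: bytes, value: bytes) -> None:
                self.data.setdefault(key, set()).add(value)

            def fetch(self, key: bytes) -> Iterable[bytes]:
                yield from self.data.get(key, ())

            def delete(self, key: bytes, value: bytes) -> None:
                self.data.get(key, set()).discard(value)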

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support it, you can call |ExampleDatabase.add_listener| to add a function
    as a change listener, which will be called whenever a value is added, deleted,
    or moved. See |ExampleDatabase.add_listener| for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, unless they need to be
    compatible with features that require it. Currently, no Hypothesis features
    require change listening.

    .. note::

        Change listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Methods
    -------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
    """

    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
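
        For example, a minimal sketch of a listener which just logs events
        (``db`` here is any database supporting change listening):

        .. code-block:: python

            def log_event(event):
                kind, (key, value) = event
                print(f"{kind}: key={key!r} value={value!r}")

            db.add_listener(log_event)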
255 """
256 had_listeners = bool(self._listeners)
257 self._listeners.append(f)
258 if not had_listeners:
259 self._start_listening()
260
261 def remove_listener(self, f: ListenerT, /) -> None:
262 """
263 Removes ``f`` from the list of change listeners.
264
265 If ``f`` is not in the list of change listeners, silently do nothing.
266 """
267 if f not in self._listeners:
268 return
269 self._listeners.remove(f)
270 if not self._listeners:
271 self._stop_listening()
272
273 def clear_listeners(self) -> None:
274 """Remove all change listeners."""
275 had_listeners = bool(self._listeners)
276 self._listeners.clear()
277 if had_listeners:
278 self._stop_listening()
279
280 def _broadcast_change(self, event: ListenerEventT) -> None:
281 """
282 Called when a value has been either added to or deleted from a key in
283 the underlying database store. The possible values for ``event`` are:
284
285 * ``("save", (key, value))``
286 * ``("delete", (key, value))``
287
288 ``value`` may be ``None`` for the ``delete`` event, indicating we know
289 that some value was deleted under this key, but not its exact value.
290
291 Note that you should not assume your instance is the only reference to
292 the underlying database store. For example, if two instances of
293 |DirectoryBasedExampleDatabase| reference the same directory,
294 _broadcast_change should be called whenever a file is added or removed
295 from the directory, even if that database was not responsible for
296 changing the file.
297 """
298 for listener in self._listeners:
299 listener(event)
300
301 def _start_listening(self) -> None:
302 """
        Called when a change listener is added to a database which previously
        had none. This is intended to let databases defer expensive listening
        setup until it is actually needed.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
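
    For example, to keep examples for a single session only (the profile name
    is illustrative):

    .. code-block:: python

        from hypothesis import settings

        settings.register_profile("ephemeral", database=InMemoryExampleDatabase())
        settings.load_profile("ephemeral")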
341 """
342
343 def __init__(self) -> None:
344 super().__init__()
345 self.data: dict[bytes, set[bytes]] = {}
346
347 def __repr__(self) -> str:
348 return f"InMemoryExampleDatabase({self.data!r})"
349
350 def __eq__(self, other: object) -> bool:
351 return isinstance(other, InMemoryExampleDatabase) and self.data is other.data
352
353 def fetch(self, key: bytes) -> Iterable[bytes]:
354 yield from self.data.get(key, ())
355
356 def save(self, key: bytes, value: bytes) -> None:
357 value = bytes(value)
358 values = self.data.setdefault(key, set())
359 changed = value not in values
360 values.add(value)
361
362 if changed:
363 self._broadcast_change(("save", (key, value)))
364
365 def delete(self, key: bytes, value: bytes) -> None:
366 value = bytes(value)
367 values = self.data.get(key, set())
368 changed = value in values
369 values.discard(value)
370
371 if changed:
372 self._broadcast_change(("delete", (key, value)))
373
374 def _start_listening(self) -> None:
375 # declare compatibility with the listener api, but do the actual
376 # implementation in .delete and .save, since we know we are the only
377 # writer to .data.
378 pass
379
380 def _stop_listening(self) -> None:
381 pass
382
383
384def _hash(key: bytes) -> str:
385 return sha384(key).hexdigest()[:16]
386
387
388class DirectoryBasedExampleDatabase(ExampleDatabase):
389 """Use a directory to store Hypothesis examples as files.
390
391 Each test corresponds to a directory, and each example to a file within that
392 directory. While the contents are fairly opaque, a
393 |DirectoryBasedExampleDatabase| can be shared by checking the directory
394 into version control, for example with the following ``.gitignore``::
395
396 # Ignore files cached by Hypothesis...
397 .hypothesis/*
398 # except for the examples directory
399 !.hypothesis/examples/
400
401 Note however that this only makes sense if you also pin to an exact version of
402 Hypothesis, and we would usually recommend implementing a shared database with
403 a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
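
    For example, a sketch of a profile using the conventional local path:

    .. code-block:: python

        from hypothesis import settings

        settings.register_profile(
            "local", database=DirectoryBasedExampleDatabase(".hypothesis/examples")
        )
        settings.load_profile("local")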
404 """
405
406 # we keep a database entry of the full values of all the database keys.
407 # currently only used for inverse mapping of hash -> key in change listening.
408 _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
409 _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)
410
411 def __init__(self, path: StrPathT) -> None:
412 super().__init__()
413 self.path = Path(path)
414 self.keypaths: dict[bytes, Path] = {}
415 self._observer: BaseObserver | None = None
416
417 def __repr__(self) -> str:
418 return f"DirectoryBasedExampleDatabase({self.path!r})"
419
420 def __eq__(self, other: object) -> bool:
421 return (
422 isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
423 )
424
425 def _key_path(self, key: bytes) -> Path:
426 try:
427 return self.keypaths[key]
428 except KeyError:
429 pass
430 self.keypaths[key] = self.path / _hash(key)
431 return self.keypaths[key]
432
433 def _value_path(self, key: bytes, value: bytes) -> Path:
434 return self._key_path(key) / _hash(value)
435
436 def fetch(self, key: bytes) -> Iterable[bytes]:
437 kp = self._key_path(key)
438 if not kp.is_dir():
439 return
440 for path in os.listdir(kp):
441 try:
442 yield (kp / path).read_bytes()
443 except OSError:
444 pass
445
446 def save(self, key: bytes, value: bytes) -> None:
447 key_path = self._key_path(key)
448 if key_path.name != self._metakeys_hash:
449 # add this key to our meta entry of all keys - taking care to avoid
450 # infinite recursion.
451 self.save(self._metakeys_name, key)
452
453 # Note: we attempt to create the dir in question now. We
454 # already checked for permissions, but there can still be other issues,
455 # e.g. the disk is full, or permissions might have been changed.
456 try:
457 key_path.mkdir(exist_ok=True, parents=True)
458 path = self._value_path(key, value)
459 if not path.exists():
                # To mimic an atomic write, create and write the value in a
                # temporary file, and only rename it to the final path after.
                # This avoids any intermediate state where the file is created
                # (and empty) but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            pass

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(FileSystemEventHandler):
            def on_created(
                _self, event: Union[FileCreatedEvent, DirCreatedEvent]
            ) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    hash_to_key[value_path.name] = value_path.read_bytes()
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(
                _self, event: Union[FileDeletedEvent, DirDeletedEvent]
            ) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(_self, event: Union[FileMovedEvent, DirMovedEvent]) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database,
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally, and cannot remove examples
    from the shared database.
685 """
686
687 def __init__(self, *dbs: ExampleDatabase) -> None:
688 super().__init__()
689 assert all(isinstance(db, ExampleDatabase) for db in dbs)
690 self._wrapped = dbs
691
692 def __repr__(self) -> str:
693 return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))
694
695 def __eq__(self, other: object) -> bool:
696 return (
697 isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
698 )
699
700 def fetch(self, key: bytes) -> Iterable[bytes]:
701 seen = set()
702 for db in self._wrapped:
703 for value in db.fetch(key):
704 if value not in seen:
705 yield value
706 seen.add(value)
707
708 def save(self, key: bytes, value: bytes) -> None:
709 for db in self._wrapped:
710 db.save(key, value)
711
712 def delete(self, key: bytes, value: bytes) -> None:
713 for db in self._wrapped:
714 db.delete(key, value)
715
716 def move(self, src: bytes, dest: bytes, value: bytes) -> None:
717 for db in self._wrapped:
718 db.move(src, dest, value)
719
720 def _start_listening(self) -> None:
721 for db in self._wrapped:
722 db.add_listener(self._broadcast_change)
723
724 def _stop_listening(self) -> None:
725 for db in self._wrapped:
726 db.remove_listener(self._broadcast_change)
727
728
729class GitHubArtifactDatabase(ExampleDatabase):
730 """
731 A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.
732
733 You can use this for sharing example databases between CI runs and developers, allowing
734 the latter to get read-only access to the former. This is particularly useful for
735 continuous fuzzing (i.e. with `HypoFuzz <https://hypofuzz.com/>`_),
736 where the CI system can help find new failing examples through fuzzing,
737 and developers can reproduce them locally without any manual effort.
738
739 .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI,
        GitHub Actions provides this automatically, but it needs to be set manually
        for local usage. On a developer machine, this would usually be a
        `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, the token needs ``repo`` scope for a classic
        token, or ``actions:read`` for a fine-grained token.

    In most cases, this will be used through the
    :class:`~hypothesis.database.MultiplexedDatabase`, by combining a local
    directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate the local
        # one, which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default
    expiration period of one day. You can adjust this through the ``cache_timeout``
    argument.

    For monorepo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
797 """
798
799 def __init__(
800 self,
801 owner: str,
802 repo: str,
803 artifact_name: str = "hypothesis-example-db",
804 cache_timeout: timedelta = timedelta(days=1),
805 path: Optional[StrPathT] = None,
806 ):
807 super().__init__()
808 self.owner = owner
809 self.repo = repo
810 self.artifact_name = artifact_name
811 self.cache_timeout = cache_timeout
812
813 # Get the GitHub token from the environment
814 # It's unnecessary to use a token if the repo is public
815 self.token: Optional[str] = getenv("GITHUB_TOKEN")
816
817 if path is None:
818 self.path: Path = Path(
819 storage_directory(f"github-artifacts/{self.artifact_name}/")
820 )
821 else:
822 self.path = Path(path)
823
824 # We don't want to initialize the cache until we need to
825 self._initialized: bool = False
826 self._disabled: bool = False
827
        # Path to the artifact currently in use, e.g.
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Optional[Path] = None
        # This caches the artifact structure
        self._access_cache: Optional[dict[PurePath, set[PurePath]]] = None

        # Message to display if the user doesn't wrap this in ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

                # testzip() alone is not a thorough check, so we also build the
                # access cache here; reading the file listing up front gives us
                # better coverage of the artifact's validity.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath
                        self._access_cache[keypath].add(PurePath(filename))
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger warning that we suppressed earlier by intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> Optional[bytes]:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
958 "X-GitHub-Api-Version": "2022-11-28 ",
959 "Authorization": f"Bearer {self.token}",
960 },
961 )
962 warning_message = None
963 response_bytes: Optional[bytes] = None
964 try:
965 with urlopen(request) as response:
966 response_bytes = response.read()
967 except HTTPError as e:
968 if e.code == 401:
969 warning_message = (
970 "Authorization failed when trying to download artifact from GitHub. "
971 "Check that you have a valid GITHUB_TOKEN set in your environment."
972 )
973 else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Optional[Path]:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all files under this keypath from the access cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
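
    For example, a sketch wrapping a directory-based database so that test runs
    do not block on disk writes (the path is illustrative):

    .. code-block:: python

        db = BackgroundWriteDatabase(
            DirectoryBasedExampleDatabase(".hypothesis/examples")
        )
        db.save(b"key", b"value")  # enqueued on the background thread
        assert b"value" in db.fetch(b"key")  # fetch() waits for pending writes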
1074 """
1075
1076 def __init__(self, db: ExampleDatabase) -> None:
1077 super().__init__()
1078 self._db = db
1079 self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
1080 self._thread = Thread(target=self._worker, daemon=True)
1081 self._thread.start()
1082 # avoid an unbounded timeout during gc. 0.1 should be plenty for most
1083 # use cases.
1084 weakref.finalize(self, self._join, 0.1)
1085
1086 def __repr__(self) -> str:
1087 return f"BackgroundWriteDatabase({self._db!r})"
1088
1089 def __eq__(self, other: object) -> bool:
1090 return isinstance(other, BackgroundWriteDatabase) and self._db == other._db
1091
1092 def _worker(self) -> None:
1093 while True:
1094 method, args = self._queue.get()
1095 getattr(self._db, method)(*args)
1096 self._queue.task_done()
1097
1098 def _join(self, timeout: Optional[float] = None) -> None:
1099 # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
1100 with self._queue.all_tasks_done:
1101 while self._queue.unfinished_tasks:
1102 self._queue.all_tasks_done.wait(timeout)
1103
1104 def fetch(self, key: bytes) -> Iterable[bytes]:
1105 self._join()
1106 return self._db.fetch(key)
1107
1108 def save(self, key: bytes, value: bytes) -> None:
1109 self._queue.put(("save", (key, value)))
1110
1111 def delete(self, key: bytes, value: bytes) -> None:
1112 self._queue.put(("delete", (key, value)))
1113
1114 def move(self, src: bytes, dest: bytes, value: bytes) -> None:
1115 self._queue.put(("move", (src, dest, value)))
1116
1117 def _start_listening(self) -> None:
1118 self._db.add_listener(self._broadcast_change)
1119
1120 def _stop_listening(self) -> None:
1121 self._db.remove_listener(self._broadcast_change)
1122
1123
1124def _pack_uleb128(value: int) -> bytes:
1125 """
    Serialize an integer into variable-length bytes. For each byte, the low
    seven bits carry (part of) the integer, and the high bit indicates whether
    the integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
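
    For example, ``300 == 0b10_0101100`` packs to two bytes: ``0xAC`` (the low
    seven bits ``0101100`` with the continuation bit set), then ``0x02`` (the
    remaining bits ``0b10``).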
1131 """
1132 parts = bytearray()
1133 assert value >= 0
1134 while True:
1135 # chop off 7 bits
1136 byte = value & ((1 << 7) - 1)
1137 value >>= 7
1138 # set the continuation bit if we have more left
1139 if value:
1140 byte |= 1 << 7
1141
1142 parts.append(byte)
1143 if not value:
1144 break
1145 return bytes(parts)
1146
1147
1148def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
1149 """
    Invert _pack_uleb128. Returns ``(bytes_consumed, value)``: the number of
    bytes read from the buffer, and the decoded integer.
1152 """
1153 value = 0
1154 for i, byte in enumerate(buffer):
1155 n = byte & ((1 << 7) - 1)
1156 value |= n << (i * 7)
1157
1158 if not byte >> 7:
1159 break
1160 return (i + 1, value)
1161
1162
1163def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
1164 """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
1165 # We use a custom serialization format for this, which might seem crazy - but our
1166 # data is a flat sequence of elements, and standard tools like protobuf or msgpack
1167 # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
1168 #
    # We simply encode each element with a metadata byte, a ULEB128-encoded size
    # if needed, and then the payload bytes. For booleans, the payload is inlined
    # into the metadata.
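    #
    # Worked examples (illustrative, derived from the encoding below):
    #   choices_to_bytes([True])   == b"\x01"   # tag 0, payload bit inline
    #   choices_to_bytes([5])      == b"A\x05"  # metadata (2 << 5) | 1 == ord("A")
    #   choices_to_bytes([b"abc"]) == b"cabc"   # metadata (3 << 5) | 3 == ord("c")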
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `tag_ssss [uleb128 size?] [payload]`
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            (offset, size) = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> Optional[tuple[ChoiceT, ...]]:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
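
    Round-tripping preserves the values, e.g.
    ``choices_from_bytes(choices_to_bytes([1, "a"])) == (1, "a")``.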
1240 """
1241 try:
1242 return _choices_from_bytes(buffer)
1243 except Exception:
1244 # deserialization error, eg because our format changed or someone put junk
1245 # data in the db.
1246 return None