# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Callable, Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Literal,
    TypeAlias,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set
from hypothesis.utils.deprecation import note_deprecation

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from watchdog.observers.api import BaseObserver

StrPathT: TypeAlias = str | PathLike[str]
SaveDataT: TypeAlias = tuple[bytes, bytes]  # key, value
DeleteDataT: TypeAlias = tuple[bytes, bytes | None]  # key, value
ListenerEventT: TypeAlias = (
    tuple[Literal["save"], SaveDataT] | tuple[Literal["delete"], DeleteDataT]
)
ListenerT: TypeAlias = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
    """
    Returns True if the desired path can be used as database path because
    either the directory exists and can be used, or its root directory can
    be used and we can make the directory as needed.
    """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:  # pragma: no cover
        # path.exists() returns False on 3.14+ instead of raising. See
        # https://docs.python.org/3.14/library/pathlib.html#querying-file-type-and-status
        return False


def _db_for_path(
    path: StrPathT | UniqueIdentifier | Literal[":memory:"] | None = not_set,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
#
# We avoid type-checking this block due to this combination of facts:
# * our check-types-api CI job runs under 3.14
# * tools.txt therefore pins to a newer version of sphinx which uses 3.12+ `type`
#   syntax
# * in test_mypy.py, mypy sees this block, sees sphinx is installed, tries parsing
#   sphinx code, and errors
#
# Putting `and not TYPE_CHECKING` here is just a convenience for our testing setup
# (because we don't split mypy tests by running CI version, eg), not for runtime
# behavior.
if "sphinx" in sys.modules and not TYPE_CHECKING:  # pragma: no cover
    try:
        import sphinx.ext.autodoc

        signature = "hypothesis.database._EDMeta.__call__"

        # _METACLASS_CALL_BLACKLIST moved in newer sphinx versions
        try:
            import sphinx.ext.autodoc._dynamic._signatures as _module
        except ImportError:
            _module = sphinx.ext.autodoc

        # _METACLASS_CALL_BLACKLIST is a frozenset in later sphinx versions
        if isinstance(_module._METACLASS_CALL_BLACKLIST, frozenset):
            _module._METACLASS_CALL_BLACKLIST = _module._METACLASS_CALL_BLACKLIST | {
                signature
            }
        else:
            _module._METACLASS_CALL_BLACKLIST.append(signature)
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in |settings.database|.

    Hypothesis automatically saves failures to the database set in
    |settings.database|. The next time the test is run, Hypothesis will replay
    any failures from the database in |settings.database| for that test (in
    |Phase.reuse|).

    The database is best thought of as a cache that you never need to invalidate.
    Entries may be transparently dropped when upgrading your Hypothesis version
    or changing your test. Do not rely on the database for correctness; to ensure
    Hypothesis always tries an input, use |@example|.

    A Hypothesis database is a simple mapping of bytes to sets of bytes. Hypothesis
    provides several concrete database subclasses. To write your own database class,
    see :doc:`/how-to/custom-database`.

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support change listening, calling |ExampleDatabase.add_listener| adds
    a function as a change listener, which will be called whenever a value is
    added, deleted, or moved inside the database. See |ExampleDatabase.add_listener|
    for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, though they will not be compatible
    with features that require change listening until they do so.

    .. note::

        While no Hypothesis features currently require change listening, change
        listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Database methods
    ----------------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
    """

    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
        """
        had_listeners = bool(self._listeners)
        self._listeners.append(f)
        if not had_listeners:
            self._start_listening()

    def remove_listener(self, f: ListenerT, /) -> None:
        """
        Remove ``f`` from the list of change listeners.

        If ``f`` is not in the list of change listeners, silently do nothing.
        """
        if f not in self._listeners:
            return
        self._listeners.remove(f)
        if not self._listeners:
            self._stop_listening()

    def clear_listeners(self) -> None:
        """Remove all change listeners."""
        had_listeners = bool(self._listeners)
        self._listeners.clear()
        if had_listeners:
            self._stop_listening()

    def _broadcast_change(self, event: ListenerEventT) -> None:
        """
        Called when a value has been either added to or deleted from a key in
        the underlying database store. The possible values for ``event`` are:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        ``value`` may be ``None`` for the ``delete`` event, indicating we know
        that some value was deleted under this key, but not its exact value.

        Note that you should not assume your instance is the only reference to
        the underlying database store. For example, if two instances of
        |DirectoryBasedExampleDatabase| reference the same directory,
        _broadcast_change should be called whenever a file is added or removed
        from the directory, even if that database was not responsible for
        changing the file.
        """
        for listener in self._listeners:
            listener(event)

    def _start_listening(self) -> None:
        """
        Called when the database adds a change listener, and did not previously
        have any change listeners. Intended to allow databases to wait to start
        expensive listening operations until necessary.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )
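

# A minimal sketch (not part of Hypothesis) of a custom ExampleDatabase subclass,
# showing the three required methods. The dict-of-sets layout mirrors the
# "mapping of bytes to sets of bytes" model described in the class docstring;
# change-listening support is optional and omitted here.
#
#     class DictExampleDatabase(ExampleDatabase):
#         def __init__(self) -> None:
#             super().__init__()
#             self._data: dict[bytes, set[bytes]] = {}
#
#         def save(self, key: bytes, value: bytes) -> None:
#             self._data.setdefault(key, set()).add(value)
#
#         def fetch(self, key: bytes) -> Iterable[bytes]:
#             yield from self._data.get(key, ())
#
#         def delete(self, key: bytes, value: bytes) -> None:
#             self._data.get(key, set()).discard(value)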


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
    """

    def __init__(self) -> None:
        super().__init__()
        self.data: dict[bytes, set[bytes]] = {}

    def __repr__(self) -> str:
        return f"InMemoryExampleDatabase({self.data!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, InMemoryExampleDatabase) and self.data is other.data

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self.data.get(key, ())

    def save(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.setdefault(key, set())
        changed = value not in values
        values.add(value)

        if changed:
            self._broadcast_change(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.get(key, set())
        changed = value in values
        values.discard(value)

        if changed:
            self._broadcast_change(("delete", (key, value)))

    def _start_listening(self) -> None:
        # declare compatibility with the listener api, but do the actual
        # implementation in .delete and .save, since we know we are the only
        # writer to .data.
        pass

    def _stop_listening(self) -> None:
        pass
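

# Example usage, as a sketch: keys map to sets of values, so saving the same
# value twice is a no-op and fetch() yields each stored value once.
#
#     db = InMemoryExampleDatabase()
#     db.save(b"key", b"value")
#     db.save(b"key", b"value")
#     assert list(db.fetch(b"key")) == [b"value"]
#     db.delete(b"key", b"value")
#     assert list(db.fetch(b"key")) == []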


def _hash(key: bytes) -> str:
    return sha384(key).hexdigest()[:16]


class DirectoryBasedExampleDatabase(ExampleDatabase):
    """Use a directory to store Hypothesis examples as files.

    Each test corresponds to a directory, and each example to a file within that
    directory. While the contents are fairly opaque, a
    |DirectoryBasedExampleDatabase| can be shared by checking the directory
    into version control, for example with the following ``.gitignore``::

        # Ignore files cached by Hypothesis...
        .hypothesis/*
        # except for the examples directory
        !.hypothesis/examples/

    Note however that this only makes sense if you also pin to an exact version of
    Hypothesis, and we would usually recommend implementing a shared database with
    a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
    """

    # we keep a database entry of the full values of all the database keys.
    # currently only used for inverse mapping of hash -> key in change listening.
    _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
    _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)

    def __init__(self, path: StrPathT) -> None:
        super().__init__()
        self.path = Path(path)
        self.keypaths: dict[bytes, Path] = {}
        self._observer: BaseObserver | None = None

    def __repr__(self) -> str:
        return f"DirectoryBasedExampleDatabase({self.path!r})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
        )

    def _key_path(self, key: bytes) -> Path:
        try:
            return self.keypaths[key]
        except KeyError:
            pass
        self.keypaths[key] = self.path / _hash(key)
        return self.keypaths[key]

    def _value_path(self, key: bytes, value: bytes) -> Path:
        return self._key_path(key) / _hash(value)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        kp = self._key_path(key)
        if not kp.is_dir():
            return

        try:
            for path in os.listdir(kp):
                try:
                    yield (kp / path).read_bytes()
                except OSError:
                    pass
        except OSError:  # pragma: no cover
            # the `kp` directory might have been deleted in the meantime
            pass

    def save(self, key: bytes, value: bytes) -> None:
        key_path = self._key_path(key)
        if key_path.name != self._metakeys_hash:
            # add this key to our meta entry of all keys - taking care to avoid
            # infinite recursion.
            self.save(self._metakeys_name, key)

        # Note: we attempt to create the dir in question now. We
        # already checked for permissions, but there can still be other issues,
        # e.g. the disk is full, or permissions might have been changed.
        try:
            key_path.mkdir(exist_ok=True, parents=True)
            path = self._value_path(key, value)
            if not path.exists():
                # to mimic an atomic write, create and write in a temporary
                # directory, and only move to the final path after. This avoids
                # any intermediate state where the file is created (and empty)
                # but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            return

        # try deleting the key dir, which will only succeed if the dir is empty
        # (i.e. ``value`` was the last value in this key).
        try:
            self._key_path(key).rmdir()
        except OSError:
            pass
        else:
            # if the deletion succeeded, also delete this key entry from metakeys.
            # (if this key happens to be the metakey itself, this deletion will
            # fail; that's ok and faster than checking for this rare case.)
            self.delete(self._metakeys_name, key)

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(
            FileSystemEventHandler
        ):  # pragma: no cover  # skipped in test_database.py for now
            def on_created(_self, event: FileCreatedEvent | DirCreatedEvent) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    try:
                        hash_to_key[value_path.name] = value_path.read_bytes()
                    except OSError:  # pragma: no cover
                        # this might occur if all the values in a key have been
                        # deleted and DirectoryBasedExampleDatabase removes its
                        # metakeys entry (which is `value_path` here).
                        pass
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(self, event: FileDeletedEvent | DirDeletedEvent) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(self, event: FileMovedEvent | DirMovedEvent) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None
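

# A sketch of the change-listener API against a directory-based database.
# This requires the optional watchdog dependency (`pip install
# hypothesis[watchdog]`); events arrive asynchronously as the
# ("save", (key, value)) / ("delete", (key, value)) tuples documented on
# ExampleDatabase.add_listener.
#
#     db = DirectoryBasedExampleDatabase(".hypothesis/examples")
#     db.add_listener(print)
#     db.save(b"key", b"value")  # eventually prints ("save", (b"key", b"value"))
#     db.clear_listeners()       # stops the watchdog observer again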


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database,
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally and cannot remove examples
    from the shared database.
    """

    def __init__(self, *dbs: ExampleDatabase) -> None:
        super().__init__()
        assert all(isinstance(db, ExampleDatabase) for db in dbs)
        self._wrapped = dbs

    def __repr__(self) -> str:
        return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
        )

    def fetch(self, key: bytes) -> Iterable[bytes]:
        seen = set()
        for db in self._wrapped:
            for value in db.fetch(key):
                if value not in seen:
                    yield value
                    seen.add(value)

    def save(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.save(key, value)

    def delete(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.delete(key, value)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.move(src, dest, value)

    def _start_listening(self) -> None:
        for db in self._wrapped:
            db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        for db in self._wrapped:
            db.remove_listener(self._broadcast_change)


class GitHubArtifactDatabase(ExampleDatabase):
    """
    A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.

    You can use this for sharing example databases between CI runs and developers, allowing
    the latter to get read-only access to the former. This is particularly useful for
    continuous fuzzing (e.g. with `HypoFuzz <https://hypofuzz.com/>`_),
    where the CI system can help find new failing examples through fuzzing,
    and developers can reproduce them locally without any manual effort.

    .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI, GitHub Actions provides
        this automatically, but it needs to be set manually for local usage. On a developer machine,
        this would usually be a `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, it's necessary for the token to have ``repo`` scope
        in the case of a classic token, or ``actions:read`` in the case of a fine-grained token.

    In most cases, this will be used
    through the :class:`~hypothesis.database.MultiplexedDatabase`,
    by combining a local directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate the local
        # one, which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with the
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default expiration period
    of 1 day. You can adjust this through the ``cache_timeout`` property.

    For mono-repo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
    """

    def __init__(
        self,
        owner: str,
        repo: str,
        artifact_name: str = "hypothesis-example-db",
        cache_timeout: timedelta = timedelta(days=1),
        path: StrPathT | None = None,
    ):
        super().__init__()
        self.owner = owner
        self.repo = repo
        self.artifact_name = artifact_name
        self.cache_timeout = cache_timeout

        # Get the GitHub token from the environment
        # It's unnecessary to use a token if the repo is public
        self.token: str | None = getenv("GITHUB_TOKEN")

        if path is None:
            self.path: Path = Path(
                storage_directory(f"github-artifacts/{self.artifact_name}/")
            )
        else:
            self.path = Path(path)

        # We don't want to initialize the cache until we need to
        self._initialized: bool = False
        self._disabled: bool = False

        # This is the path to the artifact in usage
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Path | None = None
        # This caches the artifact structure
        self._access_cache: dict[PurePath, set[PurePath]] | None = None

        # Message to display if user doesn't wrap around ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

            # Turns out that testzip() doesn't work quite well, so we do the
            # cache initialization here instead; this will give us more
            # coverage of the artifact.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath
                        self._access_cache[keypath].add(PurePath(filename))
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger warning that we suppressed earlier by intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> bytes | None:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {self.token}",
            },
        )
        warning_message = None
        response_bytes: bytes | None = None
        try:
            with urlopen(request) as response:
                response_bytes = response.read()
        except HTTPError as e:
            if e.code == 401:
                warning_message = (
                    "Authorization failed when trying to download artifact from GitHub. "
                    "Check that you have a valid GITHUB_TOKEN set in your environment."
                )
            else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
            # see https://github.com/python/cpython/issues/128734
            e.close()
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Path | None:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all the files in the kp from the cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        self._db = db
        self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
        self._thread: Thread | None = None

    def _ensure_thread(self) -> None:
        if self._thread is None:
            self._thread = Thread(target=self._worker, daemon=True)
            self._thread.start()
            # avoid an unbounded timeout during gc. 0.1 should be plenty for most
            # use cases.
            weakref.finalize(self, self._join, 0.1)

    def __repr__(self) -> str:
        return f"BackgroundWriteDatabase({self._db!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BackgroundWriteDatabase) and self._db == other._db

    def _worker(self) -> None:
        while True:
            method, args = self._queue.get()
            getattr(self._db, method)(*args)
            self._queue.task_done()

    def _join(self, timeout: float | None = None) -> None:
        # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
        with self._queue.all_tasks_done:
            while self._queue.unfinished_tasks:
                self._queue.all_tasks_done.wait(timeout)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        self._join()
        return self._db.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("delete", (key, value)))

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("move", (src, dest, value)))

    def _start_listening(self) -> None:
        self._db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        self._db.remove_listener(self._broadcast_change)
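

# A sketch of deferred writes: BackgroundWriteDatabase enqueues save/delete/move
# onto a daemon thread, while fetch() first drains the queue so that reads
# still observe earlier writes.
#
#     db = BackgroundWriteDatabase(InMemoryExampleDatabase())
#     db.save(b"key", b"value")                    # returns immediately
#     assert list(db.fetch(b"key")) == [b"value"]  # waits for the queued write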


def _pack_uleb128(value: int) -> bytes:
    """
    Serialize an integer into variable-length bytes. For each byte, the first 7
    bits represent (part of) the integer, while the last bit indicates whether the
    integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
    """
    parts = bytearray()
    assert value >= 0
    while True:
        # chop off 7 bits
        byte = value & ((1 << 7) - 1)
        value >>= 7
        # set the continuation bit if we have more left
        if value:
            byte |= 1 << 7

        parts.append(byte)
        if not value:
            break
    return bytes(parts)


def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
    """
    Inverts _pack_uleb128, and also returns the index at which we stopped
    reading.
    """
    value = 0
    for i, byte in enumerate(buffer):
        n = byte & ((1 << 7) - 1)
        value |= n << (i * 7)

        if not byte >> 7:
            break
    return (i + 1, value)
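

# A worked example of the ULEB128 encoding above: 300 is 0b100101100, which
# splits into the 7-bit groups 0b0101100 (low) and 0b0000010 (high). The low
# group is emitted first, with its continuation bit set:
#
#     _pack_uleb128(300) == bytes([0b1_0101100, 0b0_0000010]) == b"\xac\x02"
#     _unpack_uleb128(b"\xac\x02") == (2, 300)  # (bytes consumed, value)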


def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
    """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
    # We use a custom serialization format for this, which might seem crazy - but our
    # data is a flat sequence of elements, and standard tools like protobuf or msgpack
    # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
    #
    # We simply encode each element with a metadata byte, if needed a uleb128 size,
    # and then the payload bytes. For booleans, the payload is inlined into the
    # metadata.
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `tag_ssss [uleb128 size?] [payload]`
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            offset, size = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...] | None:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
    """
    try:
        return _choices_from_bytes(buffer)
    except Exception:
        # deserialization error, eg because our format changed or someone put junk
        # data in the db.
        return None
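

# A round-trip sketch of the serialization format: booleans inline their payload
# in the tag byte, while other choice types get a `tag_ssss` metadata byte
# followed by the payload. For example, the int 5 encodes as tag 0b010 with
# size 1 (giving the metadata byte 0x41), followed by the payload byte 0x05:
#
#     assert choices_to_bytes([True, 5]) == b"\x01\x41\x05"
#     buf = choices_to_bytes([True, 5, b"ab", "x", 0.5])
#     assert choices_from_bytes(buf) == (True, 5, b"ab", "x", 0.5)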