# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Callable, Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    ClassVar,
    Literal,
    TypeAlias,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis._settings import note_deprecation
from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from watchdog.observers.api import BaseObserver

StrPathT: TypeAlias = str | PathLike[str]
SaveDataT: TypeAlias = tuple[bytes, bytes]  # key, value
DeleteDataT: TypeAlias = tuple[bytes, bytes | None]  # key, value
ListenerEventT: TypeAlias = (
    tuple[Literal["save"], SaveDataT] | tuple[Literal["delete"], DeleteDataT]
)
ListenerT: TypeAlias = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
    """
    Returns True if the desired path can be used as database path because
    either the directory exists and can be used, or its root directory can
    be used and we can make the directory as needed.
    """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:
        return False


def _db_for_path(
    path: StrPathT | UniqueIdentifier | Literal[":memory:"] | None = not_set,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
if "sphinx" in sys.modules:
    try:
        import sphinx.ext.autodoc

        signature = "hypothesis.database._EDMeta.__call__"
        # _METACLASS_CALL_BLACKLIST is a frozenset in later sphinx versions
        if isinstance(sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST, frozenset):
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST = (
                sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST | {signature}
            )
        else:
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST.append(signature)
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in |settings.database|.

    Hypothesis automatically saves failures to the database set in
    |settings.database|. The next time the test is run, Hypothesis will replay
    any failures from the database in |settings.database| for that test (in
    |Phase.reuse|).

    The database is best thought of as a cache that you never need to invalidate.
    Entries may be transparently dropped when upgrading your Hypothesis version
    or changing your test. Do not rely on the database for correctness; to ensure
    Hypothesis always tries an input, use |@example|.

    A Hypothesis database is a simple mapping of bytes to sets of bytes. Hypothesis
    provides several concrete database subclasses. To write your own database class,
    see :doc:`/how-to/custom-database`.

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support change listening, calling |ExampleDatabase.add_listener| adds
    a function as a change listener, which will be called whenever a value is
    added, deleted, or moved inside the database. See |ExampleDatabase.add_listener|
    for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, though they will not be compatible
    with features that require change listening until they do so.

    .. note::

        While no Hypothesis features currently require change listening, change
        listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Database methods
    ----------------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
    """

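    # A minimal custom database, as a sketch (the name `DictBackedExampleDatabase`
    # is illustrative, not part of Hypothesis). It implements the three required
    # methods over the bytes -> set-of-bytes mapping described above:
    #
    #     class DictBackedExampleDatabase(ExampleDatabase):
    #         def __init__(self) -> None:
    #             super().__init__()
    #             self._data: dict[bytes, set[bytes]] = {}
    #
    #         def save(self, key: bytes, value: bytes) -> None:
    #             self._data.setdefault(key, set()).add(value)
    #
    #         def fetch(self, key: bytes) -> Iterable[bytes]:
    #             yield from self._data.get(key, ())
    #
    #         def delete(self, key: bytes, value: bytes) -> None:
    #             self._data.get(key, set()).discard(value)
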
    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
        """
        had_listeners = bool(self._listeners)
        self._listeners.append(f)
        if not had_listeners:
            self._start_listening()

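    # Sketch of listener usage (illustrative only, not part of the API surface):
    #
    #     def on_change(event):
    #         kind, (key, value) = event  # kind is "save" or "delete"
    #         print(kind, key, value)
    #
    #     db = InMemoryExampleDatabase()
    #     db.add_listener(on_change)
    #     db.save(b"key", b"value")  # calls on_change(("save", (b"key", b"value")))
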
    def remove_listener(self, f: ListenerT, /) -> None:
        """
        Remove ``f`` from the list of change listeners.

        If ``f`` is not in the list of change listeners, silently do nothing.
        """
        if f not in self._listeners:
            return
        self._listeners.remove(f)
        if not self._listeners:
            self._stop_listening()

    def clear_listeners(self) -> None:
        """Remove all change listeners."""
        had_listeners = bool(self._listeners)
        self._listeners.clear()
        if had_listeners:
            self._stop_listening()

    def _broadcast_change(self, event: ListenerEventT) -> None:
        """
        Called when a value has been either added to or deleted from a key in
        the underlying database store. The possible values for ``event`` are:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        ``value`` may be ``None`` for the ``delete`` event, indicating we know
        that some value was deleted under this key, but not its exact value.

        Note that you should not assume your instance is the only reference to
        the underlying database store. For example, if two instances of
        |DirectoryBasedExampleDatabase| reference the same directory,
        _broadcast_change should be called whenever a file is added or removed
        from the directory, even if that database was not responsible for
        changing the file.
        """
        for listener in self._listeners:
            listener(event)

    def _start_listening(self) -> None:
        """
        Called when the database adds a change listener, and did not previously
        have any change listeners. Intended to allow databases to wait to start
        expensive listening operations until necessary.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
    """

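    # Typical usage is via |settings.database| (a sketch, assuming the standard
    # hypothesis decorators):
    #
    #     from hypothesis import given, settings, strategies as st
    #
    #     @settings(database=InMemoryExampleDatabase())
    #     @given(st.integers())
    #     def test_something(x):
    #         ...
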
    def __init__(self) -> None:
        super().__init__()
        self.data: dict[bytes, set[bytes]] = {}

    def __repr__(self) -> str:
        return f"InMemoryExampleDatabase({self.data!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, InMemoryExampleDatabase) and self.data is other.data

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self.data.get(key, ())

    def save(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.setdefault(key, set())
        changed = value not in values
        values.add(value)

        if changed:
            self._broadcast_change(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.get(key, set())
        changed = value in values
        values.discard(value)

        if changed:
            self._broadcast_change(("delete", (key, value)))

    def _start_listening(self) -> None:
        # declare compatibility with the listener api, but do the actual
        # implementation in .delete and .save, since we know we are the only
        # writer to .data.
        pass

    def _stop_listening(self) -> None:
        pass


def _hash(key: bytes) -> str:
    return sha384(key).hexdigest()[:16]


class DirectoryBasedExampleDatabase(ExampleDatabase):
    """Use a directory to store Hypothesis examples as files.

    Each test corresponds to a directory, and each example to a file within that
    directory. While the contents are fairly opaque, a
    |DirectoryBasedExampleDatabase| can be shared by checking the directory
    into version control, for example with the following ``.gitignore``::

        # Ignore files cached by Hypothesis...
        .hypothesis/*
        # except for the examples directory
        !.hypothesis/examples/

    Note however that this only makes sense if you also pin to an exact version of
    Hypothesis, and we would usually recommend implementing a shared database with
    a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
    """

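    # On-disk layout, as used by _key_path/_value_path below (hashes are
    # truncated sha384 hexdigests; file contents are the raw value bytes):
    #
    #     <path>/<_hash(key)>/<_hash(value)>
    #
    # e.g. a value saved under some key in ".hypothesis/examples" lands in
    # ".hypothesis/examples/<16 hex chars>/<16 hex chars>".
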
    # we keep a database entry of the full values of all the database keys.
    # currently only used for inverse mapping of hash -> key in change listening.
    _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
    _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)

    def __init__(self, path: StrPathT) -> None:
        super().__init__()
        self.path = Path(path)
        self.keypaths: dict[bytes, Path] = {}
        self._observer: BaseObserver | None = None

    def __repr__(self) -> str:
        return f"DirectoryBasedExampleDatabase({self.path!r})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
        )

    def _key_path(self, key: bytes) -> Path:
        try:
            return self.keypaths[key]
        except KeyError:
            pass
        self.keypaths[key] = self.path / _hash(key)
        return self.keypaths[key]

    def _value_path(self, key: bytes, value: bytes) -> Path:
        return self._key_path(key) / _hash(value)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        kp = self._key_path(key)
        if not kp.is_dir():
            return

        try:
            for path in os.listdir(kp):
                try:
                    yield (kp / path).read_bytes()
                except OSError:
                    pass
        except OSError:  # pragma: no cover
            # the `kp` directory might have been deleted in the meantime
            pass

    def save(self, key: bytes, value: bytes) -> None:
        key_path = self._key_path(key)
        if key_path.name != self._metakeys_hash:
            # add this key to our meta entry of all keys - taking care to avoid
            # infinite recursion.
            self.save(self._metakeys_name, key)

        # Note: we attempt to create the dir in question now. We
        # already checked for permissions, but there can still be other issues,
        # e.g. the disk is full, or permissions might have been changed.
        try:
            key_path.mkdir(exist_ok=True, parents=True)
            path = self._value_path(key, value)
            if not path.exists():
                # to mimic an atomic write, create and write in a temporary
                # directory, and only move to the final path after. This avoids
                # any intermediate state where the file is created (and empty)
                # but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            return

        # try deleting the key dir, which will only succeed if the dir is empty
        # (i.e. ``value`` was the last value in this key).
        try:
            self._key_path(key).rmdir()
        except OSError:
            pass
        else:
            # if the deletion succeeded, also delete this key entry from metakeys.
            # (if this key happens to be the metakey itself, this deletion will
            # fail; that's ok and faster than checking for this rare case.)
            self.delete(self._metakeys_name, key)

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(FileSystemEventHandler):
            def on_created(_self, event: FileCreatedEvent | DirCreatedEvent) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    try:
                        hash_to_key[value_path.name] = value_path.read_bytes()
                    except OSError:  # pragma: no cover
                        # this might occur if all the values in a key have been
                        # deleted and DirectoryBasedExampleDatabase removes its
                        # metakeys entry (which is `value_path` here).
                        pass
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(self, event: FileDeletedEvent | DirDeletedEvent) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(self, event: FileMovedEvent | DirMovedEvent) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
    """

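    # For example (an illustrative sketch):
    #
    #     db = ReadOnlyDatabase(DirectoryBasedExampleDatabase(".hypothesis/examples"))
    #     db.save(b"key", b"value")        # silently does nothing
    #     values = list(db.fetch(b"key"))  # reads pass through to the wrapped db
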
    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database;
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally and cannot remove examples
    from the shared database.
    """

    def __init__(self, *dbs: ExampleDatabase) -> None:
        super().__init__()
        assert all(isinstance(db, ExampleDatabase) for db in dbs)
        self._wrapped = dbs

    def __repr__(self) -> str:
        return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
        )

    def fetch(self, key: bytes) -> Iterable[bytes]:
        seen = set()
        for db in self._wrapped:
            for value in db.fetch(key):
                if value not in seen:
                    yield value
                    seen.add(value)

    def save(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.save(key, value)

    def delete(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.delete(key, value)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.move(src, dest, value)

    def _start_listening(self) -> None:
        for db in self._wrapped:
            db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        for db in self._wrapped:
            db.remove_listener(self._broadcast_change)


class GitHubArtifactDatabase(ExampleDatabase):
    """
    A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.

    You can use this for sharing example databases between CI runs and developers, allowing
    the latter to get read-only access to the former. This is particularly useful for
    continuous fuzzing (i.e. with `HypoFuzz <https://hypofuzz.com/>`_),
    where the CI system can help find new failing examples through fuzzing,
    and developers can reproduce them locally without any manual effort.

    .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI, GitHub Actions provides
        this automatically, but it needs to be set manually for local usage. On a developer machine,
        this would usually be a `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, the token needs the ``repo`` scope
        in the case of a classic token, or ``actions:read`` in the case of a fine-grained token.

    In most cases, this will be used
    through the :class:`~hypothesis.database.MultiplexedDatabase`,
    by combining a local directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate its local one,
        # which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with the
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default expiration period
    of 1 day. You can adjust this through the ``cache_timeout`` property.

    For mono-repo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
    """

    def __init__(
        self,
        owner: str,
        repo: str,
        artifact_name: str = "hypothesis-example-db",
        cache_timeout: timedelta = timedelta(days=1),
        path: StrPathT | None = None,
    ):
        super().__init__()
        self.owner = owner
        self.repo = repo
        self.artifact_name = artifact_name
        self.cache_timeout = cache_timeout

        # Get the GitHub token from the environment
        # It's unnecessary to use a token if the repo is public
        self.token: str | None = getenv("GITHUB_TOKEN")

        if path is None:
            self.path: Path = Path(
                storage_directory(f"github-artifacts/{self.artifact_name}/")
            )
        else:
            self.path = Path(path)

        # We don't want to initialize the cache until we need to
        self._initialized: bool = False
        self._disabled: bool = False

        # This is the path to the artifact in usage
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Path | None = None
        # This caches the artifact structure
        self._access_cache: dict[PurePath, set[PurePath]] | None = None

        # Message to display if user doesn't wrap around ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

            # testzip() alone doesn't catch every problem, so we also build the
            # access cache here, which exercises more of the artifact.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath
                        self._access_cache[keypath].add(PurePath(filename))
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger the warning that we suppressed earlier by intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> bytes | None:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {self.token}",
            },
        )
        warning_message = None
        response_bytes: bytes | None = None
        try:
            with urlopen(request) as response:
                response_bytes = response.read()
        except HTTPError as e:
            if e.code == 401:
                warning_message = (
                    "Authorization failed when trying to download artifact from GitHub. "
                    "Check that you have a valid GITHUB_TOKEN set in your environment."
                )
            else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
            # see https://github.com/python/cpython/issues/128734
            e.close()
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Path | None:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all the files in the kp from the cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
    """

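    # For example (an illustrative sketch):
    #
    #     db = BackgroundWriteDatabase(DirectoryBasedExampleDatabase(".hypothesis/examples"))
    #     db.save(b"key", b"value")  # returns immediately; the write happens on a worker thread
    #     list(db.fetch(b"key"))     # waits for pending writes, then yields b"value"
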
    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        self._db = db
        self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
        self._thread: Thread | None = None

    def _ensure_thread(self) -> None:
        if self._thread is None:
            self._thread = Thread(target=self._worker, daemon=True)
            self._thread.start()
            # avoid an unbounded timeout during gc. 0.1 should be plenty for most
            # use cases.
            weakref.finalize(self, self._join, 0.1)

    def __repr__(self) -> str:
        return f"BackgroundWriteDatabase({self._db!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BackgroundWriteDatabase) and self._db == other._db

    def _worker(self) -> None:
        while True:
            method, args = self._queue.get()
            getattr(self._db, method)(*args)
            self._queue.task_done()

    def _join(self, timeout: float | None = None) -> None:
        # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
        with self._queue.all_tasks_done:
            while self._queue.unfinished_tasks:
                self._queue.all_tasks_done.wait(timeout)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        self._join()
        return self._db.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("delete", (key, value)))

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("move", (src, dest, value)))

    def _start_listening(self) -> None:
        self._db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        self._db.remove_listener(self._broadcast_change)


def _pack_uleb128(value: int) -> bytes:
    """
    Serialize an integer into variable-length bytes. For each byte, the low 7
    bits represent (part of) the integer, while the high bit indicates whether
    the integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
    """
    parts = bytearray()
    assert value >= 0
    while True:
        # chop off 7 bits
        byte = value & ((1 << 7) - 1)
        value >>= 7
        # set the continuation bit if we have more left
        if value:
            byte |= 1 << 7

        parts.append(byte)
        if not value:
            break
    return bytes(parts)


def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
    """
    Inverts _pack_uleb128, returning both the index at which we stopped reading
    and the decoded integer, as ``(index, value)``.
    """
    value = 0
    for i, byte in enumerate(buffer):
        n = byte & ((1 << 7) - 1)
        value |= n << (i * 7)

        if not byte >> 7:
            break
    return (i + 1, value)

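# A worked example of the encoding (illustrative only): 300 == 0b10_0101100.
# The low seven bits (0b0101100 == 0x2C) are emitted first with the continuation
# bit set (-> 0xAC), and the remaining bits (0b10 == 0x02) form the final byte.
#
#     assert _pack_uleb128(300) == bytes([0xAC, 0x02])
#     assert _unpack_uleb128(bytes([0xAC, 0x02])) == (2, 300)

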
def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
    """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
    # We use a custom serialization format for this, which might seem crazy - but our
    # data is a flat sequence of elements, and standard tools like protobuf or msgpack
    # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
    #
    # We simply encode each element with a metadata byte, if needed a ULEB128-encoded
    # size, and then the payload bytes. For booleans, the payload is inlined into the
    # metadata byte.
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `ttt_sssss [uleb128 size?] [payload]`
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            (offset, size) = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...] | None:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
    """
    try:
        return _choices_from_bytes(buffer)
    except Exception:
        # deserialization error, e.g. because our format changed or someone put
        # junk data in the db.
        return None

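# A round-trip sketch (illustrative only): every supported choice type survives
# serialization, including values that generic formats tend to handle poorly.
#
#     choices = (True, 2.5, -17, b"\x00\xff", "héllo")
#     assert choices_from_bytes(choices_to_bytes(choices)) == choices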