# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    Optional,
    Union,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis._settings import note_deprecation
from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from typing import TypeAlias

    from watchdog.observers.api import BaseObserver

StrPathT: "TypeAlias" = Union[str, PathLike[str]]
SaveDataT: "TypeAlias" = tuple[bytes, bytes]  # key, value
DeleteDataT: "TypeAlias" = tuple[bytes, Optional[bytes]]  # key, value
ListenerEventT: "TypeAlias" = Union[
    tuple[Literal["save"], SaveDataT], tuple[Literal["delete"], DeleteDataT]
]
ListenerT: "TypeAlias" = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
    """
    Returns True if the desired path can be used as database path because
    either the directory exists and can be used, or its root directory can
    be used and we can make the directory as needed.
    """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:
        return False


def _db_for_path(
    path: Optional[Union[StrPathT, UniqueIdentifier, Literal[":memory:"]]] = None,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
if "sphinx" in sys.modules:
    try:
        import sphinx.ext.autodoc

        signature = "hypothesis.database._EDMeta.__call__"
        # _METACLASS_CALL_BLACKLIST is a frozenset in later sphinx versions
        if isinstance(sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST, frozenset):
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST = (
                sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST | {signature}
            )
        else:
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST.append(signature)
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in |settings.database|.

    Hypothesis automatically saves failures to the database set in
    |settings.database|. The next time the test is run, Hypothesis will replay
    any failures from the database in |settings.database| for that test (in
    |Phase.reuse|).

    The database is best thought of as a cache that you never need to invalidate.
    Entries may be transparently dropped when upgrading your Hypothesis version
    or changing your test. Do not rely on the database for correctness; to ensure
    Hypothesis always tries an input, use |@example|.

    A Hypothesis database is a simple mapping of bytes to sets of bytes. Hypothesis
    provides several concrete database subclasses. To write your own database class,
    see :doc:`/how-to/custom-database`.
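
    A quick sketch of these mapping semantics, using the concrete
    ``InMemoryExampleDatabase`` subclass (keys and values here are arbitrary):

    .. code-block:: python

        db = InMemoryExampleDatabase()
        db.save(b"key", b"value1")
        db.save(b"key", b"value2")
        db.save(b"key", b"value2")  # no-op; the values under a key form a set
        assert sorted(db.fetch(b"key")) == [b"value1", b"value2"]
        db.delete(b"key", b"value1")
        assert list(db.fetch(b"key")) == [b"value2"]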

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support change listening, calling |ExampleDatabase.add_listener| adds
    a function as a change listener, which will be called whenever a value is
    added, deleted, or moved inside the database. See |ExampleDatabase.add_listener|
    for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, though they will not be compatible
    with features that require change listening until they do so.

    .. note::

        While no Hypothesis features currently require change listening, change
        listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Database methods
    ----------------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
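
    A minimal sketch of a custom subclass, implementing just the three required
    methods (the class name here is illustrative, not part of the API; see the
    how-to guide above for a fuller treatment):

    .. code-block:: python

        class DictDatabase(ExampleDatabase):
            def __init__(self):
                super().__init__()
                self.data = {}

            def save(self, key, value):
                self.data.setdefault(key, set()).add(value)

            def fetch(self, key):
                yield from self.data.get(key, ())

            def delete(self, key, value):
                self.data.get(key, set()).discard(value)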
    """

    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
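
        For example, a sketch of a listener that simply records events, using
        the concrete ``InMemoryExampleDatabase`` subclass (any listening-capable
        database behaves the same way):

        .. code-block:: python

            events = []
            db = InMemoryExampleDatabase()
            db.add_listener(events.append)
            db.save(b"key", b"value")
            assert events == [("save", (b"key", b"value"))]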
        """
        had_listeners = bool(self._listeners)
        self._listeners.append(f)
        if not had_listeners:
            self._start_listening()

    def remove_listener(self, f: ListenerT, /) -> None:
        """
        Removes ``f`` from the list of change listeners.

        If ``f`` is not in the list of change listeners, silently do nothing.
        """
        if f not in self._listeners:
            return
        self._listeners.remove(f)
        if not self._listeners:
            self._stop_listening()

    def clear_listeners(self) -> None:
        """Remove all change listeners."""
        had_listeners = bool(self._listeners)
        self._listeners.clear()
        if had_listeners:
            self._stop_listening()

    def _broadcast_change(self, event: ListenerEventT) -> None:
        """
        Called when a value has been either added to or deleted from a key in
        the underlying database store. The possible values for ``event`` are:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        ``value`` may be ``None`` for the ``delete`` event, indicating we know
        that some value was deleted under this key, but not its exact value.

        Note that you should not assume your instance is the only reference to
        the underlying database store. For example, if two instances of
        |DirectoryBasedExampleDatabase| reference the same directory,
        _broadcast_change should be called whenever a file is added or removed
        from the directory, even if that database was not responsible for
        changing the file.
        """
        for listener in self._listeners:
            listener(event)

    def _start_listening(self) -> None:
        """
        Called when a change listener is added to a database that previously
        had none. This allows databases to defer expensive listening setup
        until it is actually needed.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
    """

    def __init__(self) -> None:
        super().__init__()
        self.data: dict[bytes, set[bytes]] = {}

    def __repr__(self) -> str:
        return f"InMemoryExampleDatabase({self.data!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, InMemoryExampleDatabase) and self.data is other.data

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self.data.get(key, ())

    def save(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.setdefault(key, set())
        changed = value not in values
        values.add(value)

        if changed:
            self._broadcast_change(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.get(key, set())
        changed = value in values
        values.discard(value)

        if changed:
            self._broadcast_change(("delete", (key, value)))

    def _start_listening(self) -> None:
        # declare compatibility with the listener api, but do the actual
        # implementation in .delete and .save, since we know we are the only
        # writer to .data.
        pass

    def _stop_listening(self) -> None:
        pass


def _hash(key: bytes) -> str:
    return sha384(key).hexdigest()[:16]


class DirectoryBasedExampleDatabase(ExampleDatabase):
    """Use a directory to store Hypothesis examples as files.

    Each test corresponds to a directory, and each example to a file within that
    directory. While the contents are fairly opaque, a
    |DirectoryBasedExampleDatabase| can be shared by checking the directory
    into version control, for example with the following ``.gitignore``::

        # Ignore files cached by Hypothesis...
        .hypothesis/*
        # except for the examples directory
        !.hypothesis/examples/

    Note however that this only makes sense if you also pin to an exact version of
    Hypothesis, and we would usually recommend implementing a shared database with
    a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
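
    On disk, each key becomes a subdirectory named with a truncated hash of the
    key, and each value a file inside it named with a hash of the value. A
    sketch of the resulting layout (hash spellings are illustrative)::

        <path>/
            <hash(key1)>/
                <hash(value1)>
                <hash(value2)>
            <hash(key2)>/
                <hash(value3)>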
    """

    # we keep a database entry of the full values of all the database keys.
    # currently only used for inverse mapping of hash -> key in change listening.
    _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
    _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)

    def __init__(self, path: StrPathT) -> None:
        super().__init__()
        self.path = Path(path)
        self.keypaths: dict[bytes, Path] = {}
        self._observer: BaseObserver | None = None

    def __repr__(self) -> str:
        return f"DirectoryBasedExampleDatabase({self.path!r})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
        )

    def _key_path(self, key: bytes) -> Path:
        try:
            return self.keypaths[key]
        except KeyError:
            pass
        self.keypaths[key] = self.path / _hash(key)
        return self.keypaths[key]

    def _value_path(self, key: bytes, value: bytes) -> Path:
        return self._key_path(key) / _hash(value)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        kp = self._key_path(key)
        if not kp.is_dir():
            return

        try:
            for path in os.listdir(kp):
                try:
                    yield (kp / path).read_bytes()
                except OSError:
                    pass
        except OSError:  # pragma: no cover
            # the `kp` directory might have been deleted in the meantime
            pass

    def save(self, key: bytes, value: bytes) -> None:
        key_path = self._key_path(key)
        if key_path.name != self._metakeys_hash:
            # add this key to our meta entry of all keys - taking care to avoid
            # infinite recursion.
            self.save(self._metakeys_name, key)

        # Note: we attempt to create the dir in question now. We
        # already checked for permissions, but there can still be other issues,
        # e.g. the disk is full, or permissions might have been changed.
        try:
            key_path.mkdir(exist_ok=True, parents=True)
            path = self._value_path(key, value)
            if not path.exists():
                # to mimic an atomic write, create and write the value to a
                # temporary file, and only move it to the final path afterwards.
                # This avoids any intermediate state where the file is created
                # (and empty) but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            return

        # try deleting the key dir, which will only succeed if the dir is empty
        # (i.e. ``value`` was the last value in this key).
        try:
            self._key_path(key).rmdir()
        except OSError:
            pass
        else:
            # if the deletion succeeded, also delete this key entry from metakeys.
            # (if this key happens to be the metakey itself, this deletion will
            # fail; that's ok and faster than checking for this rare case.)
            self.delete(self._metakeys_name, key)

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(FileSystemEventHandler):
            def on_created(
                _self, event: Union[FileCreatedEvent, DirCreatedEvent]
            ) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    try:
                        hash_to_key[value_path.name] = value_path.read_bytes()
                    except OSError:  # pragma: no cover
                        # this might occur if all the values in a key have been
                        # deleted and DirectoryBasedExampleDatabase removes its
                        # metakeys entry (which is `value_path` here).
                        pass
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(
                _self, event: Union[FileDeletedEvent, DirDeletedEvent]
            ) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(_self, event: Union[FileMovedEvent, DirMovedEvent]) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database;
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally and cannot remove examples
    from the shared database.
    """

    def __init__(self, *dbs: ExampleDatabase) -> None:
        super().__init__()
        assert all(isinstance(db, ExampleDatabase) for db in dbs)
        self._wrapped = dbs

    def __repr__(self) -> str:
        return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
        )

    def fetch(self, key: bytes) -> Iterable[bytes]:
        seen = set()
        for db in self._wrapped:
            for value in db.fetch(key):
                if value not in seen:
                    yield value
                    seen.add(value)

    def save(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.save(key, value)

    def delete(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.delete(key, value)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.move(src, dest, value)

    def _start_listening(self) -> None:
        for db in self._wrapped:
            db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        for db in self._wrapped:
            db.remove_listener(self._broadcast_change)


class GitHubArtifactDatabase(ExampleDatabase):
    """
    A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.

    You can use this for sharing example databases between CI runs and developers, allowing
    the latter to get read-only access to the former. This is particularly useful for
    continuous fuzzing (i.e. with `HypoFuzz <https://hypofuzz.com/>`_),
    where the CI system can help find new failing examples through fuzzing,
    and developers can reproduce them locally without any manual effort.

    .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI, GitHub Actions provides
        this automatically, but it needs to be set manually for local usage. On a developer machine,
        this would usually be a `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, it's necessary for the token to have ``repo`` scope
        in the case of a classic token, or ``actions:read`` in the case of a fine-grained token.

    In most cases, this will be used through the
    :class:`~hypothesis.database.MultiplexedDatabase`, by combining a local
    directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate the
        # local one, which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with the
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default expiration period
    of 1 day. You can adjust this through the ``cache_timeout`` argument.

    For mono-repo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
    """

    def __init__(
        self,
        owner: str,
        repo: str,
        artifact_name: str = "hypothesis-example-db",
        cache_timeout: timedelta = timedelta(days=1),
        path: Optional[StrPathT] = None,
    ):
        super().__init__()
        self.owner = owner
        self.repo = repo
        self.artifact_name = artifact_name
        self.cache_timeout = cache_timeout

        # Get the GitHub token from the environment
        # It's unnecessary to use a token if the repo is public
        self.token: Optional[str] = getenv("GITHUB_TOKEN")

        if path is None:
            self.path: Path = Path(
                storage_directory(f"github-artifacts/{self.artifact_name}/")
            )
        else:
            self.path = Path(path)

        # We don't want to initialize the cache until we need to
        self._initialized: bool = False
        self._disabled: bool = False

        # This is the path to the artifact currently in use:
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Optional[Path] = None
        # This caches the artifact structure
        self._access_cache: Optional[dict[PurePath, set[PurePath]]] = None

        # Message to display if the user doesn't wrap this database in ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

            # testzip() does not catch every kind of corruption, so building
            # the access cache here as well exercises more of the artifact
            # and gives us a more thorough validity check.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath
                        self._access_cache[keypath].add(PurePath(filename))
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger the warning that we suppressed earlier by intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> Optional[bytes]:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {self.token}",
            },
        )
        warning_message = None
        response_bytes: Optional[bytes] = None
        try:
            with urlopen(request) as response:
                response_bytes = response.read()
        except HTTPError as e:
            if e.code == 401:
                warning_message = (
                    "Authorization failed when trying to download artifact from GitHub. "
                    "Check that you have a valid GITHUB_TOKEN set in your environment."
                )
            else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Optional[Path]:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all files in this keypath from the cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
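
    A quick sketch of the intended usage (the wrapped database and keys here
    are arbitrary):

    .. code-block:: python

        inner = DirectoryBasedExampleDatabase(".hypothesis/examples")
        db = BackgroundWriteDatabase(inner)
        db.save(b"key", b"value")  # enqueued; returns without waiting on the disk
        assert list(db.fetch(b"key")) == [b"value"]  # waits for pending writes first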
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        self._db = db
        self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
        self._thread: Optional[Thread] = None

    def _ensure_thread(self) -> None:
        if self._thread is None:
            self._thread = Thread(target=self._worker, daemon=True)
            self._thread.start()
            # avoid an unbounded timeout during gc. 0.1 should be plenty for most
            # use cases.
            weakref.finalize(self, self._join, 0.1)

    def __repr__(self) -> str:
        return f"BackgroundWriteDatabase({self._db!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BackgroundWriteDatabase) and self._db == other._db

    def _worker(self) -> None:
        while True:
            method, args = self._queue.get()
            getattr(self._db, method)(*args)
            self._queue.task_done()

    def _join(self, timeout: Optional[float] = None) -> None:
        # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
        with self._queue.all_tasks_done:
            while self._queue.unfinished_tasks:
                self._queue.all_tasks_done.wait(timeout)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        self._join()
        return self._db.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("delete", (key, value)))

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("move", (src, dest, value)))

    def _start_listening(self) -> None:
        self._db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        self._db.remove_listener(self._broadcast_change)


def _pack_uleb128(value: int) -> bytes:
    """
    Serialize an integer into variable-length bytes. For each byte, the first 7
    bits represent (part of) the integer, while the last bit indicates whether the
    integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
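
    For example (a worked case)::

        _pack_uleb128(300) == bytes([0xAC, 0x02])
        # low 7 bits of 300 (0b0101100) with the continuation bit set -> 0xAC;
        # remaining bits (0b10) with no continuation -> 0x02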
    """
    parts = bytearray()
    assert value >= 0
    while True:
        # chop off 7 bits
        byte = value & ((1 << 7) - 1)
        value >>= 7
        # set the continuation bit if we have more left
        if value:
            byte |= 1 << 7

        parts.append(byte)
        if not value:
            break
    return bytes(parts)


def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
    """
    Inverts _pack_uleb128, and also returns the index at which we stopped
    reading.
    """
    value = 0
    for i, byte in enumerate(buffer):
        n = byte & ((1 << 7) - 1)
        value |= n << (i * 7)

        if not byte >> 7:
            break
    return (i + 1, value)


def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
    """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
    # We use a custom serialization format for this, which might seem crazy - but our
    # data is a flat sequence of elements, and standard tools like protobuf or msgpack
    # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
    #
    # We simply encode each element with a metadata byte, if needed a ULEB128 size,
    # and then the payload bytes. For booleans, the payload is inlined into the
    # metadata byte.
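    #
    # For example, a worked case following those rules:
    #
    #     choices_to_bytes([True, 42, b"ab"])
    #     # True  -> b"\x01"      (tag 0, payload in the low bit)
    #     # 42    -> b"\x41\x2a"  (tag 2<<5 | inline size 1, then one payload byte)
    #     # b"ab" -> b"\x62ab"    (tag 3<<5 | inline size 2, then the payload)
    #     == b"\x01\x41\x2a\x62ab"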
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `ttt_sssss [uleb128 size?] [payload]` - 3-bit tag, 5-bit inline size.
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            (offset, size) = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> Optional[tuple[ChoiceT, ...]]:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
    """
    try:
        return _choices_from_bytes(buffer)
    except Exception:
        # deserialization error, eg because our format changed or someone put junk
        # data in the db.
        return None