# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    Optional,
    Union,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis._settings import note_deprecation
from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from typing import TypeAlias

    from watchdog.observers.api import BaseObserver

StrPathT: "TypeAlias" = Union[str, PathLike[str]]
SaveDataT: "TypeAlias" = tuple[bytes, bytes]  # key, value
DeleteDataT: "TypeAlias" = tuple[bytes, Optional[bytes]]  # key, value
ListenerEventT: "TypeAlias" = Union[
    tuple[Literal["save"], SaveDataT], tuple[Literal["delete"], DeleteDataT]
]
ListenerT: "TypeAlias" = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
    """
    Returns True if the desired path can be used as a database path, either
    because the directory already exists and is usable, or because its nearest
    existing ancestor is usable and we can create the directory as needed.
    """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:
        return False


def _db_for_path(
    path: Optional[Union[StrPathT, UniqueIdentifier, Literal[":memory:"]]] = None,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
if "sphinx" in sys.modules:
    try:
        import sphinx.ext.autodoc

        signature = "hypothesis.database._EDMeta.__call__"
        # _METACLASS_CALL_BLACKLIST is a frozenset in later sphinx versions
        if isinstance(sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST, frozenset):
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST = (
                sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST | {signature}
            )
        else:
            sphinx.ext.autodoc._METACLASS_CALL_BLACKLIST.append(signature)
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in |settings.database|.

    Hypothesis automatically saves failures to the database set in
    |settings.database|. The next time the test is run, Hypothesis will replay
    any failures from the database in |settings.database| for that test (in
    |Phase.reuse|).

    The database is best thought of as a cache that you never need to invalidate.
    Entries may be transparently dropped when upgrading your Hypothesis version
    or changing your test. Do not rely on the database for correctness; to ensure
    Hypothesis always tries an input, use |@example|.

    A Hypothesis database is a simple mapping of bytes to sets of bytes. Hypothesis
    provides several concrete database subclasses. To write your own database class,
    see :doc:`/how-to/custom-database`.

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support change listening, calling |ExampleDatabase.add_listener| adds
    a function as a change listener, which will be called whenever a value is
    added, deleted, or moved inside the database. See |ExampleDatabase.add_listener|
    for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, though they will not be compatible
    with features that require change listening until they do so.

    .. note::

        While no Hypothesis features currently require change listening, change
        listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Database methods
    ----------------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
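
    As a rough sketch of the three required methods, a bare-bones non-persistent
    subclass might look like the following (the class name is illustrative;
    ``InMemoryExampleDatabase`` is the supported version of this idea):

    .. code-block:: python

        class DictExampleDatabase(ExampleDatabase):
            def __init__(self):
                super().__init__()
                self.data = {}

            def save(self, key, value):
                self.data.setdefault(key, set()).add(value)

            def fetch(self, key):
                yield from self.data.get(key, ())

            def delete(self, key, value):
                self.data.get(key, set()).discard(value)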
    """

    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
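
        For example, a minimal listener which just logs each event might look
        like this sketch (assuming ``db`` supports change listening):

        .. code-block:: python

            def log_event(event):
                kind, (key, value) = event  # kind is "save" or "delete"
                print(f"{kind}: key={key!r} value={value!r}")

            db.add_listener(log_event)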
        """
        had_listeners = bool(self._listeners)
        self._listeners.append(f)
        if not had_listeners:
            self._start_listening()

    def remove_listener(self, f: ListenerT, /) -> None:
        """
        Removes ``f`` from the list of change listeners.

        If ``f`` is not in the list of change listeners, silently do nothing.
        """
        if f not in self._listeners:
            return
        self._listeners.remove(f)
        if not self._listeners:
            self._stop_listening()

    def clear_listeners(self) -> None:
        """Remove all change listeners."""
        had_listeners = bool(self._listeners)
        self._listeners.clear()
        if had_listeners:
            self._stop_listening()

    def _broadcast_change(self, event: ListenerEventT) -> None:
        """
        Called when a value has been either added to or deleted from a key in
        the underlying database store. The possible values for ``event`` are:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        ``value`` may be ``None`` for the ``delete`` event, indicating we know
        that some value was deleted under this key, but not its exact value.

        Note that you should not assume your instance is the only reference to
        the underlying database store. For example, if two instances of
        |DirectoryBasedExampleDatabase| reference the same directory,
        _broadcast_change should be called whenever a file is added or removed
        from the directory, even if that database was not responsible for
        changing the file.
        """
        for listener in self._listeners:
            listener(event)

    def _start_listening(self) -> None:
        """
        Called when a change listener is added to a database that previously
        had none. Intended to allow databases to defer expensive listening
        setup until it is actually needed.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
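
    For example, a sketch of opting in for a single test via settings:

    .. code-block:: python

        from hypothesis import given, settings, strategies as st

        @settings(database=InMemoryExampleDatabase())
        @given(st.integers())
        def test_not_persisted_between_runs(n): ...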
    """

    def __init__(self) -> None:
        super().__init__()
        self.data: dict[bytes, set[bytes]] = {}

    def __repr__(self) -> str:
        return f"InMemoryExampleDatabase({self.data!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, InMemoryExampleDatabase) and self.data is other.data

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self.data.get(key, ())

    def save(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.setdefault(key, set())
        changed = value not in values
        values.add(value)

        if changed:
            self._broadcast_change(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.get(key, set())
        changed = value in values
        values.discard(value)

        if changed:
            self._broadcast_change(("delete", (key, value)))

    def _start_listening(self) -> None:
        # declare compatibility with the listener api, but do the actual
        # implementation in .delete and .save, since we know we are the only
        # writer to .data.
        pass

    def _stop_listening(self) -> None:
        pass


def _hash(key: bytes) -> str:
    return sha384(key).hexdigest()[:16]


class DirectoryBasedExampleDatabase(ExampleDatabase):
    """Use a directory to store Hypothesis examples as files.

    Each test corresponds to a directory, and each example to a file within that
    directory. While the contents are fairly opaque, a
    |DirectoryBasedExampleDatabase| can be shared by checking the directory
    into version control, for example with the following ``.gitignore``::

        # Ignore files cached by Hypothesis...
        .hypothesis/*
        # except for the examples directory
        !.hypothesis/examples/

    Note however that this only makes sense if you also pin to an exact version of
    Hypothesis, and we would usually recommend implementing a shared database with
    a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
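
    Concretely, each value is stored at ``<path>/<hash(key)>/<hash(value)>``,
    using truncated hex digests of the raw bytes. A sketch of the resulting
    layout (hash values illustrative)::

        .hypothesis/examples/
            0f24b8626c4f4bf9/     # hash of a test's database key
                5f4f4b0eab3b5851  # hash of one saved example
                a3c7b2e8f01d6c25  # hash of another saved example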
    """

    # we keep a database entry of the full values of all the database keys.
    # currently only used for inverse mapping of hash -> key in change listening.
    _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
    _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)

    def __init__(self, path: StrPathT) -> None:
        super().__init__()
        self.path = Path(path)
        self.keypaths: dict[bytes, Path] = {}
        self._observer: BaseObserver | None = None

    def __repr__(self) -> str:
        return f"DirectoryBasedExampleDatabase({self.path!r})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
        )

    def _key_path(self, key: bytes) -> Path:
        try:
            return self.keypaths[key]
        except KeyError:
            pass
        self.keypaths[key] = self.path / _hash(key)
        return self.keypaths[key]

    def _value_path(self, key: bytes, value: bytes) -> Path:
        return self._key_path(key) / _hash(value)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        kp = self._key_path(key)
        if not kp.is_dir():
            return

        try:
            for path in os.listdir(kp):
                try:
                    yield (kp / path).read_bytes()
                except OSError:
                    pass
        except OSError:  # pragma: no cover
            # the `kp` directory might have been deleted in the meantime
            pass

    def save(self, key: bytes, value: bytes) -> None:
        key_path = self._key_path(key)
        if key_path.name != self._metakeys_hash:
            # add this key to our meta entry of all keys - taking care to avoid
            # infinite recursion.
            self.save(self._metakeys_name, key)

        # Note: we attempt to create the dir in question now. We
        # already checked for permissions, but there can still be other issues,
        # e.g. the disk is full, or permissions might have been changed.
        try:
            key_path.mkdir(exist_ok=True, parents=True)
            path = self._value_path(key, value)
            if not path.exists():
                # to mimic an atomic write, create and write in a temporary
                # directory, and only move to the final path after. This avoids
                # any intermediate state where the file is created (and empty)
                # but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            return

        # try deleting the key dir, which will only succeed if the dir is empty
        # (i.e. ``value`` was the last value in this key).
        try:
            self._key_path(key).rmdir()
        except OSError:
            pass
        else:
            # if the deletion succeeded, also delete this key entry from metakeys.
            # (if this key happens to be the metakey itself, this deletion will
            # fail; that's ok and faster than checking for this rare case.)
            self.delete(self._metakeys_name, key)

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(FileSystemEventHandler):
            def on_created(
                _self, event: Union[FileCreatedEvent, DirCreatedEvent]
            ) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    try:
                        hash_to_key[value_path.name] = value_path.read_bytes()
                    except OSError:  # pragma: no cover
                        # this might occur if all the values in a key have been
                        # deleted and DirectoryBasedExampleDatabase removes its
                        # metakeys entry (which is `value_path` here).
                        pass
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(
                self, event: Union[FileDeletedEvent, DirDeletedEvent]
            ) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(self, event: Union[FileMovedEvent, DirMovedEvent]) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database,
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally and cannot remove examples
    from the shared database.
    """

    def __init__(self, *dbs: ExampleDatabase) -> None:
        super().__init__()
        assert all(isinstance(db, ExampleDatabase) for db in dbs)
        self._wrapped = dbs

    def __repr__(self) -> str:
        return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
        )

    def fetch(self, key: bytes) -> Iterable[bytes]:
        seen = set()
        for db in self._wrapped:
            for value in db.fetch(key):
                if value not in seen:
                    yield value
                    seen.add(value)

    def save(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.save(key, value)

    def delete(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.delete(key, value)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.move(src, dest, value)

    def _start_listening(self) -> None:
        for db in self._wrapped:
            db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        for db in self._wrapped:
            db.remove_listener(self._broadcast_change)


class GitHubArtifactDatabase(ExampleDatabase):
    """
    A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.

    You can use this for sharing example databases between CI runs and developers,
    allowing the latter to get read-only access to the former. This is particularly
    useful for continuous fuzzing (i.e. with `HypoFuzz <https://hypofuzz.com/>`_),
    where the CI system can help find new failing examples through fuzzing,
    and developers can reproduce them locally without any manual effort.

    .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI, GitHub
        Actions provides this automatically, but it needs to be set manually for
        local usage. On a developer machine, this would usually be a `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, the token must have the ``repo`` scope in the
        case of a classic token, or ``actions:read`` in the case of a fine-grained
        token.

    In most cases, this will be used through the
    :class:`~hypothesis.database.MultiplexedDatabase`, by combining a local
    directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate the local
        # one, which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with the
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default
    expiration period of one day. You can adjust this through the ``cache_timeout``
    argument.

    For mono-repo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
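
    A sketch of such a setup, with a per-project artifact name (names
    illustrative):

    .. code-block:: python

        shared = ReadOnlyDatabase(
            GitHubArtifactDatabase(
                "user", "repo", artifact_name="hypofuzz-example-db-frontend"
            )
        )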
    """

    def __init__(
        self,
        owner: str,
        repo: str,
        artifact_name: str = "hypothesis-example-db",
        cache_timeout: timedelta = timedelta(days=1),
        path: Optional[StrPathT] = None,
    ):
        super().__init__()
        self.owner = owner
        self.repo = repo
        self.artifact_name = artifact_name
        self.cache_timeout = cache_timeout

        # Get the GitHub token from the environment
        # It's unnecessary to use a token if the repo is public
        self.token: Optional[str] = getenv("GITHUB_TOKEN")

        if path is None:
            self.path: Path = Path(
                storage_directory(f"github-artifacts/{self.artifact_name}/")
            )
        else:
            self.path = Path(path)

        # We don't want to initialize the cache until we need to
        self._initialized: bool = False
        self._disabled: bool = False

        # This is the path to the artifact in use:
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Optional[Path] = None
        # This caches the artifact structure
        self._access_cache: Optional[dict[PurePath, set[PurePath]]] = None

        # Message to display if the user doesn't wrap this in ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

            # testzip() turns out not to catch everything, so we do the cache
            # initialization here as well, which gives us more coverage of the
            # artifact.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath. Use setdefault() because
                        # zip archives are not guaranteed to contain an entry
                        # for the parent directory itself.
                        self._access_cache.setdefault(keypath, set()).add(
                            PurePath(filename)
                        )
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger the warning that we suppressed earlier via intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> Optional[bytes]:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {self.token}",
            },
        )
        warning_message = None
        response_bytes: Optional[bytes] = None
        try:
            with urlopen(request) as response:
                response_bytes = response.read()
        except HTTPError as e:
            if e.code == 401:
                warning_message = (
                    "Authorization failed when trying to download artifact from GitHub. "
                    "Check that you have a valid GITHUB_TOKEN set in your environment."
                )
            else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
            # see https://github.com/python/cpython/issues/128734
            e.close()
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Optional[Path]:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all files under this keypath from the cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
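
    A sketch of typical usage, wrapping the directory-based database (the path
    shown is illustrative):

    .. code-block:: python

        db = BackgroundWriteDatabase(
            DirectoryBasedExampleDatabase(".hypothesis/examples")
        )
        db.save(b"key", b"value")  # enqueued; returns immediately
        list(db.fetch(b"key"))  # waits for pending writes, then [b"value"]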
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        self._db = db
        self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
        self._thread: Optional[Thread] = None

    def _ensure_thread(self) -> None:
        if self._thread is None:
            self._thread = Thread(target=self._worker, daemon=True)
            self._thread.start()
            # avoid an unbounded timeout during gc. 0.1 should be plenty for most
            # use cases.
            weakref.finalize(self, self._join, 0.1)

    def __repr__(self) -> str:
        return f"BackgroundWriteDatabase({self._db!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BackgroundWriteDatabase) and self._db == other._db

    def _worker(self) -> None:
        while True:
            method, args = self._queue.get()
            getattr(self._db, method)(*args)
            self._queue.task_done()

    def _join(self, timeout: Optional[float] = None) -> None:
        # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
        with self._queue.all_tasks_done:
            while self._queue.unfinished_tasks:
                self._queue.all_tasks_done.wait(timeout)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        self._join()
        return self._db.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("delete", (key, value)))

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("move", (src, dest, value)))

    def _start_listening(self) -> None:
        self._db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        self._db.remove_listener(self._broadcast_change)


def _pack_uleb128(value: int) -> bytes:
    """
    Serialize an integer into variable-length bytes. For each byte, the low 7
    bits carry (part of) the integer, while the high bit indicates whether the
    integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
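
    For example, 300 is ``0b10_0101100`` in binary, and packs to two bytes:
    ``0xAC`` (the low seven bits, with the continuation bit set), then
    ``0x02`` (the remaining bits, with the continuation bit clear).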
    """
    parts = bytearray()
    assert value >= 0
    while True:
        # chop off 7 bits
        byte = value & ((1 << 7) - 1)
        value >>= 7
        # set the continuation bit if we have more left
        if value:
            byte |= 1 << 7

        parts.append(byte)
        if not value:
            break
    return bytes(parts)


def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
    """
    Inverts _pack_uleb128. Returns a tuple of (number of bytes consumed,
    decoded value).
    """
    value = 0
    for i, byte in enumerate(buffer):
        n = byte & ((1 << 7) - 1)
        value |= n << (i * 7)

        if not byte >> 7:
            break
    return (i + 1, value)


def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
    """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
    # We use a custom serialization format for this, which might seem crazy - but our
    # data is a flat sequence of elements, and standard tools like protobuf or msgpack
    # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
    #
    # We simply encode each element with a metadata byte, a uleb128 size if the
    # payload is too large to inline in the metadata byte, and then the payload
    # bytes. For booleans, the payload is inlined into the metadata.
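    #
    # For example (worked from the rules below): True encodes to b"\x01", and
    # the string "abc" encodes to a metadata byte of 0b100_00011 (tag 4, size 3)
    # followed by the payload b"abc".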
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `tag_sssss [uleb128 size?] [payload]`
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            (offset, size) = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> Optional[tuple[ChoiceT, ...]]:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
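
    For example, round-tripping a few choices:

    .. code-block:: python

        buf = choices_to_bytes([True, 1, "a"])
        assert choices_from_bytes(buf) == (True, 1, "a")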
    """
    try:
        return _choices_from_bytes(buffer)
    except Exception:
        # deserialization error, eg because our format changed or someone put junk
        # data in the db.
        return None