# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import abc
import errno
import json
import os
import struct
import sys
import tempfile
import warnings
import weakref
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from hashlib import sha384
from os import PathLike, getenv
from pathlib import Path, PurePath
from queue import Queue
from threading import Thread
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    Optional,
    Union,
    cast,
)
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
from zipfile import BadZipFile, ZipFile

from hypothesis._settings import note_deprecation
from hypothesis.configuration import storage_directory
from hypothesis.errors import HypothesisException, HypothesisWarning
from hypothesis.internal.conjecture.choice import ChoiceT
from hypothesis.utils.conventions import UniqueIdentifier, not_set

__all__ = [
    "DirectoryBasedExampleDatabase",
    "ExampleDatabase",
    "GitHubArtifactDatabase",
    "InMemoryExampleDatabase",
    "MultiplexedDatabase",
    "ReadOnlyDatabase",
]

if TYPE_CHECKING:
    from typing import TypeAlias

    from watchdog.observers.api import BaseObserver

StrPathT: "TypeAlias" = Union[str, PathLike[str]]
SaveDataT: "TypeAlias" = tuple[bytes, bytes]  # key, value
DeleteDataT: "TypeAlias" = tuple[bytes, Optional[bytes]]  # key, value
ListenerEventT: "TypeAlias" = Union[
    tuple[Literal["save"], SaveDataT], tuple[Literal["delete"], DeleteDataT]
]
ListenerT: "TypeAlias" = Callable[[ListenerEventT], Any]


def _usable_dir(path: StrPathT) -> bool:
    """
    Returns True if the desired path can be used as database path because
    either the directory exists and can be used, or its root directory can
    be used and we can make the directory as needed.
    """
    path = Path(path)
    try:
        while not path.exists():
            # Loop terminates because the root dir ('/' on unix) always exists.
            path = path.parent
        return path.is_dir() and os.access(path, os.R_OK | os.W_OK | os.X_OK)
    except PermissionError:
        return False


def _db_for_path(
    path: Optional[Union[StrPathT, UniqueIdentifier, Literal[":memory:"]]] = None,
) -> "ExampleDatabase":
    if path is not_set:
        if os.getenv("HYPOTHESIS_DATABASE_FILE") is not None:  # pragma: no cover
            raise HypothesisException(
                "The $HYPOTHESIS_DATABASE_FILE environment variable no longer has any "
                "effect. Configure your database location via a settings profile instead.\n"
                "https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles"
            )

        path = storage_directory("examples", intent_to_write=False)
        if not _usable_dir(path):  # pragma: no cover
            warnings.warn(
                "The database setting is not configured, and the default "
                "location is unusable - falling back to an in-memory "
                f"database for this session. {path=}",
                HypothesisWarning,
                stacklevel=3,
            )
            return InMemoryExampleDatabase()
    if path in (None, ":memory:"):
        return InMemoryExampleDatabase()
    path = cast(StrPathT, path)
    return DirectoryBasedExampleDatabase(path)


class _EDMeta(abc.ABCMeta):
    def __call__(self, *args: Any, **kwargs: Any) -> "ExampleDatabase":
        if self is ExampleDatabase:
            note_deprecation(
                "Creating a database using the abstract ExampleDatabase() class "
                "is deprecated. Prefer using a concrete subclass, like "
                "InMemoryExampleDatabase() or DirectoryBasedExampleDatabase(path). "
                'In particular, the special string ExampleDatabase(":memory:") '
                "should be replaced by InMemoryExampleDatabase().",
                since="2025-04-07",
                has_codemod=False,
            )
            return _db_for_path(*args, **kwargs)
        return super().__call__(*args, **kwargs)


# This __call__ method is picked up by Sphinx as the signature of all ExampleDatabase
# subclasses, which is accurate, reasonable, and unhelpful. Fortunately Sphinx
# maintains a list of metaclass-call-methods to ignore, and while they would prefer
# not to maintain it upstream (https://github.com/sphinx-doc/sphinx/pull/8262) we
# can insert ourselves here.
#
# This code only runs if Sphinx has already been imported; and it would live in our
# docs/conf.py except that we would also like it to work for anyone documenting
# downstream ExampleDatabase subclasses too.
if "sphinx" in sys.modules:
    try:
        from sphinx.ext.autodoc import _METACLASS_CALL_BLACKLIST

        _METACLASS_CALL_BLACKLIST.append("hypothesis.database._EDMeta.__call__")
    except Exception:
        pass


class ExampleDatabase(metaclass=_EDMeta):
    """
    A Hypothesis database, for use in |settings.database|.

    Hypothesis automatically saves failures to the database set in
    |settings.database|. The next time the test is run, Hypothesis will replay
    any failures from the database in |settings.database| for that test (in
    |Phase.reuse|).

    The database is best thought of as a cache that you never need to invalidate.
    Entries may be transparently dropped when upgrading your Hypothesis version
    or changing your test. Do not rely on the database for correctness; to ensure
    Hypothesis always tries an input, use |@example|.

    A Hypothesis database is a simple mapping of bytes to sets of bytes. Hypothesis
    provides several concrete database subclasses. To write your own database class,
    see :doc:`/how-to/custom-database`.

    Change listening
    ----------------

    An optional extension to |ExampleDatabase| is change listening. On databases
    which support change listening, calling |ExampleDatabase.add_listener| adds
    a function as a change listener, which will be called whenever a value is
    added, deleted, or moved inside the database. See |ExampleDatabase.add_listener|
    for details.

    All databases in Hypothesis support change listening. Custom database classes
    are not required to support change listening, though they will not be compatible
    with features that require change listening until they do so.

    .. note::

        While no Hypothesis features currently require change listening, change
        listening is required by `HypoFuzz <https://hypofuzz.com/>`_.

    Database methods
    ----------------

    Required methods:

    * |ExampleDatabase.save|
    * |ExampleDatabase.fetch|
    * |ExampleDatabase.delete|

    Optional methods:

    * |ExampleDatabase.move|

    Change listening methods:

    * |ExampleDatabase.add_listener|
    * |ExampleDatabase.remove_listener|
    * |ExampleDatabase.clear_listeners|
    * |ExampleDatabase._start_listening|
    * |ExampleDatabase._stop_listening|
    * |ExampleDatabase._broadcast_change|
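
    As a minimal sketch, a custom subclass implementing just the required
    methods might look like the following (this is essentially what
    |InMemoryExampleDatabase| does; see :doc:`/how-to/custom-database` for
    fuller guidance):

    .. code-block:: python

        class DictExampleDatabase(ExampleDatabase):
            # Illustrative only: values live in a plain dict and are not
            # persisted between runs.
            def __init__(self):
                super().__init__()
                self.data = {}

            def save(self, key: bytes, value: bytes) -> None:
                self.data.setdefault(key, set()).add(value)

            def fetch(self, key: bytes) -> Iterable[bytes]:
                yield from self.data.get(key, ())

            def delete(self, key: bytes, value: bytes) -> None:
                self.data.get(key, set()).discard(value)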
    """

    def __init__(self) -> None:
        self._listeners: list[ListenerT] = []

    @abc.abstractmethod
    def save(self, key: bytes, value: bytes) -> None:
        """Save ``value`` under ``key``.

        If ``value`` is already present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.save")

    @abc.abstractmethod
    def fetch(self, key: bytes) -> Iterable[bytes]:
        """Return an iterable over all values matching this key."""
        raise NotImplementedError(f"{type(self).__name__}.fetch")

    @abc.abstractmethod
    def delete(self, key: bytes, value: bytes) -> None:
        """Remove ``value`` from ``key``.

        If ``value`` is not present in ``key``, silently do nothing.
        """
        raise NotImplementedError(f"{type(self).__name__}.delete")

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        """
        Move ``value`` from key ``src`` to key ``dest``.

        Equivalent to ``delete(src, value)`` followed by ``save(dest, value)``,
        but may have a more efficient implementation.

        Note that ``value`` will be inserted at ``dest`` regardless of whether
        it is currently present at ``src``.
        """
        if src == dest:
            self.save(src, value)
            return
        self.delete(src, value)
        self.save(dest, value)

    def add_listener(self, f: ListenerT, /) -> None:
        """
        Add a change listener. ``f`` will be called whenever a value is saved,
        deleted, or moved in the database.

        ``f`` can be called with two different event values:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        where ``key`` and ``value`` are both ``bytes``.

        There is no ``move`` event. Instead, a move is broadcast as a
        ``delete`` event followed by a ``save`` event.

        For the ``delete`` event, ``value`` may be ``None``. This might occur if
        the database knows that a deletion has occurred in ``key``, but does not
        know what value was deleted.
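
        For example, a minimal listener might just print each event as it
        arrives (``db`` stands for any database instance here):

        .. code-block:: python

            def print_event(event):
                # event is ("save", (key, value)) or ("delete", (key, value))
                kind, (key, value) = event
                print(f"{kind}: key={key!r}, value={value!r}")

            db.add_listener(print_event)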
        """
        had_listeners = bool(self._listeners)
        self._listeners.append(f)
        if not had_listeners:
            self._start_listening()

    def remove_listener(self, f: ListenerT, /) -> None:
        """
        Removes ``f`` from the list of change listeners.

        If ``f`` is not in the list of change listeners, silently do nothing.
        """
        if f not in self._listeners:
            return
        self._listeners.remove(f)
        if not self._listeners:
            self._stop_listening()

    def clear_listeners(self) -> None:
        """Remove all change listeners."""
        had_listeners = bool(self._listeners)
        self._listeners.clear()
        if had_listeners:
            self._stop_listening()

    def _broadcast_change(self, event: ListenerEventT) -> None:
        """
        Called when a value has been either added to or deleted from a key in
        the underlying database store. The possible values for ``event`` are:

        * ``("save", (key, value))``
        * ``("delete", (key, value))``

        ``value`` may be ``None`` for the ``delete`` event, indicating we know
        that some value was deleted under this key, but not its exact value.

        Note that you should not assume your instance is the only reference to
        the underlying database store. For example, if two instances of
        |DirectoryBasedExampleDatabase| reference the same directory,
        _broadcast_change should be called whenever a file is added or removed
        from the directory, even if that database was not responsible for
        changing the file.
        """
        for listener in self._listeners:
            listener(event)

    def _start_listening(self) -> None:
        """
        Called when the database adds a change listener, and did not previously
        have any change listeners. Intended to allow databases to wait to start
        expensive listening operations until necessary.

        ``_start_listening`` and ``_stop_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_start_listening`` calls without an intermediate ``_stop_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )

    def _stop_listening(self) -> None:
        """
        Called whenever no change listeners remain on the database.

        ``_stop_listening`` and ``_start_listening`` are guaranteed to alternate,
        so you do not need to handle the case of multiple consecutive
        ``_stop_listening`` calls without an intermediate ``_start_listening``
        call.
        """
        warnings.warn(
            f"{self.__class__} does not support stopping listening for changes",
            HypothesisWarning,
            stacklevel=4,
        )


class InMemoryExampleDatabase(ExampleDatabase):
    """A non-persistent example database, implemented in terms of an in-memory
    dictionary.

    This can be useful if you call a test function several times in a single
    session, or for testing other database implementations, but because it
    does not persist between runs we do not recommend it for general use.
    """

    def __init__(self) -> None:
        super().__init__()
        self.data: dict[bytes, set[bytes]] = {}

    def __repr__(self) -> str:
        return f"InMemoryExampleDatabase({self.data!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, InMemoryExampleDatabase) and self.data is other.data

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self.data.get(key, ())

    def save(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.setdefault(key, set())
        changed = value not in values
        values.add(value)

        if changed:
            self._broadcast_change(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        value = bytes(value)
        values = self.data.get(key, set())
        changed = value in values
        values.discard(value)

        if changed:
            self._broadcast_change(("delete", (key, value)))

    def _start_listening(self) -> None:
        # declare compatibility with the listener api, but do the actual
        # implementation in .delete and .save, since we know we are the only
        # writer to .data.
        pass

    def _stop_listening(self) -> None:
        pass


def _hash(key: bytes) -> str:
    return sha384(key).hexdigest()[:16]


class DirectoryBasedExampleDatabase(ExampleDatabase):
    """Use a directory to store Hypothesis examples as files.

    Each test corresponds to a directory, and each example to a file within that
    directory. While the contents are fairly opaque, a
    |DirectoryBasedExampleDatabase| can be shared by checking the directory
    into version control, for example with the following ``.gitignore``::

        # Ignore files cached by Hypothesis...
        .hypothesis/*
        # except for the examples directory
        !.hypothesis/examples/

    Note however that this only makes sense if you also pin to an exact version of
    Hypothesis, and we would usually recommend implementing a shared database with
    a network datastore - see |ExampleDatabase|, and the |MultiplexedDatabase| helper.
    """

    # we keep a database entry of the full values of all the database keys.
    # currently only used for inverse mapping of hash -> key in change listening.
    _metakeys_name: ClassVar[bytes] = b".hypothesis-keys"
    _metakeys_hash: ClassVar[str] = _hash(_metakeys_name)

    def __init__(self, path: StrPathT) -> None:
        super().__init__()
        self.path = Path(path)
        self.keypaths: dict[bytes, Path] = {}
        self._observer: Optional["BaseObserver"] = None

    def __repr__(self) -> str:
        return f"DirectoryBasedExampleDatabase({self.path!r})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, DirectoryBasedExampleDatabase) and self.path == other.path
        )

    def _key_path(self, key: bytes) -> Path:
        try:
            return self.keypaths[key]
        except KeyError:
            pass
        self.keypaths[key] = self.path / _hash(key)
        return self.keypaths[key]

    def _value_path(self, key: bytes, value: bytes) -> Path:
        return self._key_path(key) / _hash(value)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        kp = self._key_path(key)
        if not kp.is_dir():
            return

        try:
            for path in os.listdir(kp):
                try:
                    yield (kp / path).read_bytes()
                except OSError:
                    pass
        except OSError:  # pragma: no cover
            # the `kp` directory might have been deleted in the meantime
            pass

    def save(self, key: bytes, value: bytes) -> None:
        key_path = self._key_path(key)
        if key_path.name != self._metakeys_hash:
            # add this key to our meta entry of all keys - taking care to avoid
            # infinite recursion.
            self.save(self._metakeys_name, key)

        # Note: we attempt to create the dir in question now. We
        # already checked for permissions, but there can still be other issues,
        # e.g. the disk is full, or permissions might have been changed.
        try:
            key_path.mkdir(exist_ok=True, parents=True)
            path = self._value_path(key, value)
            if not path.exists():
                # to mimic an atomic write, create and write in a temporary
                # directory, and only move to the final path after. This avoids
                # any intermediate state where the file is created (and empty)
                # but not yet written to.
                fd, tmpname = tempfile.mkstemp()
                tmppath = Path(tmpname)
                os.write(fd, value)
                os.close(fd)
                try:
                    tmppath.rename(path)
                except OSError as err:  # pragma: no cover
                    if err.errno == errno.EXDEV:
                        # Can't rename across filesystem boundaries, see e.g.
                        # https://github.com/HypothesisWorks/hypothesis/issues/4335
                        try:
                            path.write_bytes(tmppath.read_bytes())
                        except OSError:
                            pass
                    tmppath.unlink()
                assert not tmppath.exists()
        except OSError:  # pragma: no cover
            pass

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        if src == dest:
            self.save(src, value)
            return

        src_path = self._value_path(src, value)
        dest_path = self._value_path(dest, value)
        # if the dest key path does not exist, os.renames will create it for us,
        # and we will never track its creation in the meta keys entry. Do so now.
        if not self._key_path(dest).exists():
            self.save(self._metakeys_name, dest)

        try:
            os.renames(src_path, dest_path)
        except OSError:
            self.delete(src, value)
            self.save(dest, value)

    def delete(self, key: bytes, value: bytes) -> None:
        try:
            self._value_path(key, value).unlink()
        except OSError:
            return

        # try deleting the key dir, which will only succeed if the dir is empty
        # (i.e. ``value`` was the last value in this key).
        try:
            self._key_path(key).rmdir()
        except OSError:
            pass
        else:
            # if the deletion succeeded, also delete this key entry from metakeys.
            # (if this key happens to be the metakey itself, this deletion will
            # fail; that's ok and faster than checking for this rare case.)
            self.delete(self._metakeys_name, key)

    def _start_listening(self) -> None:
        try:
            from watchdog.events import (
                DirCreatedEvent,
                DirDeletedEvent,
                DirMovedEvent,
                FileCreatedEvent,
                FileDeletedEvent,
                FileMovedEvent,
                FileSystemEventHandler,
            )
            from watchdog.observers import Observer
        except ImportError:
            warnings.warn(
                f"listening for changes in a {self.__class__.__name__} "
                "requires the watchdog library. To install, run "
                "`pip install hypothesis[watchdog]`",
                HypothesisWarning,
                stacklevel=4,
            )
            return

        hash_to_key = {_hash(key): key for key in self.fetch(self._metakeys_name)}
        _metakeys_hash = self._metakeys_hash
        _broadcast_change = self._broadcast_change

        class Handler(FileSystemEventHandler):
            def on_created(
                _self, event: Union[FileCreatedEvent, DirCreatedEvent]
            ) -> None:
                # we only registered for the file creation event
                assert not isinstance(event, DirCreatedEvent)
                # watchdog events are only bytes if we passed a byte path to
                # .schedule
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                # the parent dir represents the key, and its name is the key hash
                key_hash = value_path.parent.name

                if key_hash == _metakeys_hash:
                    try:
                        hash_to_key[value_path.name] = value_path.read_bytes()
                    except OSError:  # pragma: no cover
                        # this might occur if all the values in a key have been
                        # deleted and DirectoryBasedExampleDatabase removes its
                        # metakeys entry (which is `value_path` here).
                        pass
                    return

                key = hash_to_key.get(key_hash)
                if key is None:  # pragma: no cover
                    # we didn't recognize this key. This shouldn't ever happen,
                    # but some race condition trickery might cause this.
                    return

                try:
                    value = value_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("save", (key, value)))

            def on_deleted(
                _self, event: Union[FileDeletedEvent, DirDeletedEvent]
            ) -> None:
                assert not isinstance(event, DirDeletedEvent)
                assert isinstance(event.src_path, str)

                value_path = Path(event.src_path)
                key = hash_to_key.get(value_path.parent.name)
                if key is None:  # pragma: no cover
                    return

                _broadcast_change(("delete", (key, None)))

            def on_moved(_self, event: Union[FileMovedEvent, DirMovedEvent]) -> None:
                assert not isinstance(event, DirMovedEvent)
                assert isinstance(event.src_path, str)
                assert isinstance(event.dest_path, str)

                src_path = Path(event.src_path)
                dest_path = Path(event.dest_path)
                k1 = hash_to_key.get(src_path.parent.name)
                k2 = hash_to_key.get(dest_path.parent.name)

                if k1 is None or k2 is None:  # pragma: no cover
                    return

                try:
                    value = dest_path.read_bytes()
                except OSError:  # pragma: no cover
                    return

                _broadcast_change(("delete", (k1, value)))
                _broadcast_change(("save", (k2, value)))

        # If we add a listener to a DirectoryBasedExampleDatabase whose database
        # directory doesn't yet exist, the watchdog observer will not fire any
        # events, even after the directory gets created.
        #
        # Ensure the directory exists before starting the observer.
        self.path.mkdir(exist_ok=True, parents=True)
        self._observer = Observer()
        self._observer.schedule(
            Handler(),
            # remove type: ignore when released
            # https://github.com/gorakhargosh/watchdog/pull/1096
            self.path,  # type: ignore
            recursive=True,
            event_filter=[FileCreatedEvent, FileDeletedEvent, FileMovedEvent],
        )
        self._observer.start()

    def _stop_listening(self) -> None:
        assert self._observer is not None
        self._observer.stop()
        self._observer.join()
        self._observer = None


class ReadOnlyDatabase(ExampleDatabase):
    """A wrapper to make the given database read-only.

    The implementation passes through ``fetch``, and turns ``save``, ``delete``, and
    ``move`` into silent no-ops.

    Note that this disables Hypothesis' automatic discarding of stale examples.
    It is designed to allow local machines to access a shared database (e.g. from CI
    servers), without propagating changes back from a local or in-development branch.
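
    For example, to consume a database directory synced from CI without writing
    local results back into it (the path here is illustrative):

    .. code-block:: python

        db = ReadOnlyDatabase(DirectoryBasedExampleDatabase("ci-examples/"))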
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        assert isinstance(db, ExampleDatabase)
        self._wrapped = db

    def __repr__(self) -> str:
        return f"ReadOnlyDatabase({self._wrapped!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ReadOnlyDatabase) and self._wrapped == other._wrapped

    def fetch(self, key: bytes) -> Iterable[bytes]:
        yield from self._wrapped.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        pass

    def delete(self, key: bytes, value: bytes) -> None:
        pass

    def _start_listening(self) -> None:
        # we're read only, so there are no changes to broadcast.
        pass

    def _stop_listening(self) -> None:
        pass


class MultiplexedDatabase(ExampleDatabase):
    """A wrapper around multiple databases.

    Each ``save``, ``fetch``, ``move``, or ``delete`` operation will be run against
    all of the wrapped databases. ``fetch`` does not yield duplicate values, even
    if the same value is present in two or more of the wrapped databases.

    This combines well with a :class:`ReadOnlyDatabase`, as follows:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase("/tmp/hypothesis/examples/")
        shared = CustomNetworkDatabase()

        settings.register_profile("ci", database=shared)
        settings.register_profile(
            "dev", database=MultiplexedDatabase(local, ReadOnlyDatabase(shared))
        )
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    So your CI system or fuzzing runs can populate a central shared database,
    while local runs on development machines can reproduce any failures from CI
    but will only cache their own failures locally and cannot remove examples
    from the shared database.
    """

    def __init__(self, *dbs: ExampleDatabase) -> None:
        super().__init__()
        assert all(isinstance(db, ExampleDatabase) for db in dbs)
        self._wrapped = dbs

    def __repr__(self) -> str:
        return "MultiplexedDatabase({})".format(", ".join(map(repr, self._wrapped)))

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, MultiplexedDatabase) and self._wrapped == other._wrapped
        )

    def fetch(self, key: bytes) -> Iterable[bytes]:
        seen = set()
        for db in self._wrapped:
            for value in db.fetch(key):
                if value not in seen:
                    yield value
                    seen.add(value)

    def save(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.save(key, value)

    def delete(self, key: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.delete(key, value)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        for db in self._wrapped:
            db.move(src, dest, value)

    def _start_listening(self) -> None:
        for db in self._wrapped:
            db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        for db in self._wrapped:
            db.remove_listener(self._broadcast_change)


class GitHubArtifactDatabase(ExampleDatabase):
    """
    A file-based database loaded from a `GitHub Actions <https://docs.github.com/en/actions>`_ artifact.

    You can use this for sharing example databases between CI runs and developers, allowing
    the latter to get read-only access to the former. This is particularly useful for
    continuous fuzzing (i.e. with `HypoFuzz <https://hypofuzz.com/>`_),
    where the CI system can help find new failing examples through fuzzing,
    and developers can reproduce them locally without any manual effort.

    .. note::
        You must provide ``GITHUB_TOKEN`` as an environment variable. In CI, GitHub Actions provides
        this automatically, but it needs to be set manually for local usage. On a developer machine,
        this would usually be a `Personal Access Token <https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens>`_.
        If the repository is private, the token must have ``repo`` scope
        in the case of a classic token, or ``actions:read`` in the case of a fine-grained token.

    In most cases, this will be used
    through the :class:`~hypothesis.database.MultiplexedDatabase`,
    by combining a local directory-based database with this one. For example:

    .. code-block:: python

        local = DirectoryBasedExampleDatabase(".hypothesis/examples")
        shared = ReadOnlyDatabase(GitHubArtifactDatabase("user", "repo"))

        settings.register_profile("ci", database=local)
        settings.register_profile("dev", database=MultiplexedDatabase(local, shared))
        # We don't want to use the shared database in CI, only to populate the local
        # one, which the workflow should then upload as an artifact.
        settings.load_profile("ci" if os.environ.get("CI") else "dev")

    .. note::
        Because this database is read-only, you always need to wrap it with the
        :class:`ReadOnlyDatabase`.

    A setup like this can be paired with a GitHub Actions workflow including
    something like the following:

    .. code-block:: yaml

        - name: Download example database
          uses: dawidd6/action-download-artifact@v9
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples
            if_no_artifact_found: warn
            workflow_conclusion: completed

        - name: Run tests
          run: pytest

        - name: Upload example database
          uses: actions/upload-artifact@v3
          if: always()
          with:
            name: hypothesis-example-db
            path: .hypothesis/examples

    In this workflow, we use `dawidd6/action-download-artifact <https://github.com/dawidd6/action-download-artifact>`_
    to download the latest artifact, given that the official `actions/download-artifact <https://github.com/actions/download-artifact>`_
    does not support downloading artifacts from previous workflow runs.

    The database automatically implements a simple file-based cache with a default expiration period
    of 1 day. You can adjust this through the ``cache_timeout`` argument.

    For mono-repo support, you can provide a unique ``artifact_name`` (e.g. ``hypofuzz-example-db-frontend``).
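
    For example, to expire cached artifacts after one hour rather than one day
    (``"user"`` and ``"repo"`` are placeholders):

    .. code-block:: python

        from datetime import timedelta

        shared = GitHubArtifactDatabase("user", "repo", cache_timeout=timedelta(hours=1))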
    """

    def __init__(
        self,
        owner: str,
        repo: str,
        artifact_name: str = "hypothesis-example-db",
        cache_timeout: timedelta = timedelta(days=1),
        path: Optional[StrPathT] = None,
    ):
        super().__init__()
        self.owner = owner
        self.repo = repo
        self.artifact_name = artifact_name
        self.cache_timeout = cache_timeout

        # Get the GitHub token from the environment
        # It's unnecessary to use a token if the repo is public
        self.token: Optional[str] = getenv("GITHUB_TOKEN")

        if path is None:
            self.path: Path = Path(
                storage_directory(f"github-artifacts/{self.artifact_name}/")
            )
        else:
            self.path = Path(path)

        # We don't want to initialize the cache until we need to
        self._initialized: bool = False
        self._disabled: bool = False

        # This is the path to the artifact in use:
        # .hypothesis/github-artifacts/<artifact-name>/<modified_isoformat>.zip
        self._artifact: Optional[Path] = None
        # This caches the artifact structure
        self._access_cache: Optional[dict[PurePath, set[PurePath]]] = None

        # Message to display if the user doesn't wrap this in a ReadOnlyDatabase
        self._read_only_message = (
            "This database is read-only. "
            "Please wrap this class with ReadOnlyDatabase, "
            "i.e. ReadOnlyDatabase(GitHubArtifactDatabase(...))."
        )

    def __repr__(self) -> str:
        return (
            f"GitHubArtifactDatabase(owner={self.owner!r}, "
            f"repo={self.repo!r}, artifact_name={self.artifact_name!r})"
        )

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, GitHubArtifactDatabase)
            and self.owner == other.owner
            and self.repo == other.repo
            and self.artifact_name == other.artifact_name
            and self.path == other.path
        )

    def _prepare_for_io(self) -> None:
        assert self._artifact is not None, "Artifact not loaded."

        if self._initialized:  # pragma: no cover
            return

        # Test that the artifact is valid
        try:
            with ZipFile(self._artifact) as f:
                if f.testzip():  # pragma: no cover
                    raise BadZipFile

            # testzip() alone isn't a thorough check, so we also build the
            # access cache here, which exercises more of the artifact.

            # Cache the files inside each keypath
            self._access_cache = {}
            with ZipFile(self._artifact) as zf:
                namelist = zf.namelist()
                # Iterate over files in the artifact
                for filename in namelist:
                    fileinfo = zf.getinfo(filename)
                    if fileinfo.is_dir():
                        self._access_cache[PurePath(filename)] = set()
                    else:
                        # Get the keypath from the filename
                        keypath = PurePath(filename).parent
                        # Add the file to the keypath
                        self._access_cache[keypath].add(PurePath(filename))
        except BadZipFile:
            warnings.warn(
                "The downloaded artifact from GitHub is invalid. "
                "This could be because the artifact was corrupted, "
                "or because the artifact was not created by Hypothesis.",
                HypothesisWarning,
                stacklevel=3,
            )
            self._disabled = True

        self._initialized = True

    def _initialize_db(self) -> None:
        # Trigger the warning we suppressed earlier with intent_to_write=False
        storage_directory(self.path.name)
        # Create the cache directory if it doesn't exist
        self.path.mkdir(exist_ok=True, parents=True)

        # Get all artifacts
        cached_artifacts = sorted(
            self.path.glob("*.zip"),
            key=lambda a: datetime.fromisoformat(a.stem.replace("_", ":")),
        )

        # Remove all but the latest artifact
        for artifact in cached_artifacts[:-1]:
            artifact.unlink()

        try:
            found_artifact = cached_artifacts[-1]
        except IndexError:
            found_artifact = None

        # Check if the latest artifact is a cache hit
        if found_artifact is not None and (
            datetime.now(timezone.utc)
            - datetime.fromisoformat(found_artifact.stem.replace("_", ":"))
            < self.cache_timeout
        ):
            self._artifact = found_artifact
        else:
            # Download the latest artifact from GitHub
            new_artifact = self._fetch_artifact()

            if new_artifact:
                if found_artifact is not None:
                    found_artifact.unlink()
                self._artifact = new_artifact
            elif found_artifact is not None:
                warnings.warn(
                    "Using an expired artifact as a fallback for the database: "
                    f"{found_artifact}",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._artifact = found_artifact
            else:
                warnings.warn(
                    "Couldn't acquire a new or existing artifact. Disabling database.",
                    HypothesisWarning,
                    stacklevel=2,
                )
                self._disabled = True
                return

        self._prepare_for_io()

    def _get_bytes(self, url: str) -> Optional[bytes]:  # pragma: no cover
        request = Request(
            url,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {self.token}",
            },
        )
        warning_message = None
        response_bytes: Optional[bytes] = None
        try:
            with urlopen(request) as response:
                response_bytes = response.read()
        except HTTPError as e:
            if e.code == 401:
                warning_message = (
                    "Authorization failed when trying to download artifact from GitHub. "
                    "Check that you have a valid GITHUB_TOKEN set in your environment."
                )
            else:
                warning_message = (
                    "Could not get the latest artifact from GitHub. "
                    "This could be because the repository "
                    "or artifact does not exist."
                )
        except URLError:
            warning_message = "Could not connect to GitHub to get the latest artifact."
        except TimeoutError:
            warning_message = (
                "Could not connect to GitHub to get the latest artifact "
                "(connection timed out)."
            )

        if warning_message is not None:
            warnings.warn(warning_message, HypothesisWarning, stacklevel=4)
            return None

        return response_bytes

    def _fetch_artifact(self) -> Optional[Path]:  # pragma: no cover
        # Get the list of artifacts from GitHub
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/artifacts"
        response_bytes = self._get_bytes(url)
        if response_bytes is None:
            return None

        artifacts = json.loads(response_bytes)["artifacts"]
        artifacts = [a for a in artifacts if a["name"] == self.artifact_name]

        if not artifacts:
            return None

        # Get the latest artifact from the list
        artifact = max(artifacts, key=lambda a: a["created_at"])
        url = artifact["archive_download_url"]

        # Download the artifact
        artifact_bytes = self._get_bytes(url)
        if artifact_bytes is None:
            return None

        # Save the artifact to the cache
        # We replace ":" with "_" to ensure the filenames are compatible
        # with Windows filesystems
        timestamp = datetime.now(timezone.utc).isoformat().replace(":", "_")
        artifact_path = self.path / f"{timestamp}.zip"
        try:
            artifact_path.write_bytes(artifact_bytes)
        except OSError:
            warnings.warn(
                "Could not save the latest artifact from GitHub.",
                HypothesisWarning,
                stacklevel=3,
            )
            return None

        return artifact_path

    @staticmethod
    @lru_cache
    def _key_path(key: bytes) -> PurePath:
        return PurePath(_hash(key) + "/")

    def fetch(self, key: bytes) -> Iterable[bytes]:
        if self._disabled:
            return

        if not self._initialized:
            self._initialize_db()
            if self._disabled:
                return

        assert self._artifact is not None
        assert self._access_cache is not None

        kp = self._key_path(key)

        with ZipFile(self._artifact) as zf:
            # Get all the files in this keypath from the cache
            filenames = self._access_cache.get(kp, ())
            for filename in filenames:
                with zf.open(filename.as_posix()) as f:
                    yield f.read()

    # Read-only interface
    def save(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)

    def delete(self, key: bytes, value: bytes) -> None:
        raise RuntimeError(self._read_only_message)


class BackgroundWriteDatabase(ExampleDatabase):
    """A wrapper which defers writes on the given database to a background thread.

    Calls to :meth:`~hypothesis.database.ExampleDatabase.fetch` wait for any
    enqueued writes to finish before fetching from the database.
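
    For example, wrapping a directory-based database so that saves do not block
    the calling thread:

    .. code-block:: python

        db = BackgroundWriteDatabase(DirectoryBasedExampleDatabase(".hypothesis/examples"))
        db.save(b"key", b"value")  # enqueued; handled by the background thread
        list(db.fetch(b"key"))  # waits for pending writes, then fetches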
    """

    def __init__(self, db: ExampleDatabase) -> None:
        super().__init__()
        self._db = db
        self._queue: Queue[tuple[str, tuple[bytes, ...]]] = Queue()
        self._thread: Optional[Thread] = None

    def _ensure_thread(self) -> None:
        if self._thread is None:
            self._thread = Thread(target=self._worker, daemon=True)
            self._thread.start()
            # avoid an unbounded timeout during gc. 0.1 should be plenty for most
            # use cases.
            weakref.finalize(self, self._join, 0.1)

    def __repr__(self) -> str:
        return f"BackgroundWriteDatabase({self._db!r})"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BackgroundWriteDatabase) and self._db == other._db

    def _worker(self) -> None:
        while True:
            method, args = self._queue.get()
            getattr(self._db, method)(*args)
            self._queue.task_done()

    def _join(self, timeout: Optional[float] = None) -> None:
        # copy of Queue.join with a timeout. https://bugs.python.org/issue9634
        with self._queue.all_tasks_done:
            while self._queue.unfinished_tasks:
                self._queue.all_tasks_done.wait(timeout)

    def fetch(self, key: bytes) -> Iterable[bytes]:
        self._join()
        return self._db.fetch(key)

    def save(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("save", (key, value)))

    def delete(self, key: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("delete", (key, value)))

    def move(self, src: bytes, dest: bytes, value: bytes) -> None:
        self._ensure_thread()
        self._queue.put(("move", (src, dest, value)))

    def _start_listening(self) -> None:
        self._db.add_listener(self._broadcast_change)

    def _stop_listening(self) -> None:
        self._db.remove_listener(self._broadcast_change)


def _pack_uleb128(value: int) -> bytes:
    """
    Serialize an integer into variable-length bytes. In each byte, the low seven
    bits carry (part of) the integer, while the high bit indicates whether the
    integer continues into the next byte.

    https://en.wikipedia.org/wiki/LEB128
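
    For example, ``300`` is ``0b10_0101100``: the low seven bits ``0101100`` are
    emitted first with the continuation bit set, giving ``0xAC``, and the
    remaining ``0b10`` follows as ``0x02``, so ``_pack_uleb128(300)`` returns the
    two bytes ``0xAC 0x02``.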
    """
    parts = bytearray()
    assert value >= 0
    while True:
        # chop off 7 bits
        byte = value & ((1 << 7) - 1)
        value >>= 7
        # set the continuation bit if we have more left
        if value:
            byte |= 1 << 7

        parts.append(byte)
        if not value:
            break
    return bytes(parts)


def _unpack_uleb128(buffer: bytes) -> tuple[int, int]:
    """
    Inverts _pack_uleb128, and also returns the index at which we stopped
    reading.
    """
    value = 0
    for i, byte in enumerate(buffer):
        n = byte & ((1 << 7) - 1)
        value |= n << (i * 7)

        if not byte >> 7:
            break
    return (i + 1, value)


def choices_to_bytes(choices: Iterable[ChoiceT], /) -> bytes:
    """Serialize a list of choices to a bytestring. Inverts choices_from_bytes."""
    # We use a custom serialization format for this, which might seem crazy - but our
    # data is a flat sequence of elements, and standard tools like protobuf or msgpack
    # don't deal well with e.g. nonstandard bit-pattern-NaNs, or invalid-utf8 unicode.
    #
    # We simply encode each element with a metadata byte, if needed a uleb128 size, and
    # then the payload bytes. For booleans, the payload is inlined into the metadata.
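    #
    # As a worked example, [True, 255] encodes to b"\x01\x42\x00\xff": 0x01 is the
    # inlined boolean True, and 0x42 is tag 2 (int) with size 2, followed by 255 as
    # the two-byte signed big-endian payload 0x00 0xff.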
    parts = []
    for choice in choices:
        if isinstance(choice, bool):
            # `000_0000v` - tag zero, low bit payload.
            parts.append(b"\1" if choice else b"\0")
            continue

        # `tag_sssss [uleb128 size?] [payload]`
        if isinstance(choice, float):
            tag = 1 << 5
            choice = struct.pack("!d", choice)
        elif isinstance(choice, int):
            tag = 2 << 5
            choice = choice.to_bytes(1 + choice.bit_length() // 8, "big", signed=True)
        elif isinstance(choice, bytes):
            tag = 3 << 5
        else:
            assert isinstance(choice, str)
            tag = 4 << 5
            choice = choice.encode(errors="surrogatepass")

        size = len(choice)
        if size < 0b11111:
            parts.append((tag | size).to_bytes(1, "big"))
        else:
            parts.append((tag | 0b11111).to_bytes(1, "big"))
            parts.append(_pack_uleb128(size))
        parts.append(choice)

    return b"".join(parts)


def _choices_from_bytes(buffer: bytes, /) -> tuple[ChoiceT, ...]:
    # See above for an explanation of the format.
    parts: list[ChoiceT] = []
    idx = 0
    while idx < len(buffer):
        tag = buffer[idx] >> 5
        size = buffer[idx] & 0b11111
        idx += 1

        if tag == 0:
            parts.append(bool(size))
            continue
        if size == 0b11111:
            (offset, size) = _unpack_uleb128(buffer[idx:])
            idx += offset
        chunk = buffer[idx : idx + size]
        idx += size

        if tag == 1:
            assert size == 8, "expected float64"
            parts.extend(struct.unpack("!d", chunk))
        elif tag == 2:
            parts.append(int.from_bytes(chunk, "big", signed=True))
        elif tag == 3:
            parts.append(chunk)
        else:
            assert tag == 4
            parts.append(chunk.decode(errors="surrogatepass"))
    return tuple(parts)


def choices_from_bytes(buffer: bytes, /) -> Optional[tuple[ChoiceT, ...]]:
    """
    Deserialize a bytestring to a tuple of choices. Inverts choices_to_bytes.

    Returns None if the given bytestring is not a valid serialization of choice
    sequences.
    """
    try:
        return _choices_from_bytes(buffer)
    except Exception:
        # deserialization error, e.g. because our format changed or someone put
        # junk data in the db.
        return None