1# dialects/oracle/vector.py
2# Copyright (C) 2005-2025 the SQLAlchemy authors and contributors
3# <see AUTHORS file>
4#
5# This module is part of SQLAlchemy and is released under
6# the MIT License: https://www.opensource.org/licenses/mit-license.php
7# mypy: ignore-errors
8
9
10from __future__ import annotations
11
12import array
13from dataclasses import dataclass
14from enum import Enum
15from typing import Optional
16from typing import Union
17
18from ... import types
19from ...types import Float
20
21
class VectorIndexType(Enum):
    """Enumeration of the available VECTOR index structures.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    HNSW = "HNSW"
    """Index type HNSW (Hierarchical Navigable Small World)."""

    IVF = "IVF"
    """Index type IVF (Inverted File Index)."""
39
40
class VectorDistanceType(Enum):
    """Enumeration of the supported vector distance metrics.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    EUCLIDEAN = "EUCLIDEAN"
    """Euclidean distance (L2 norm): the straight-line distance between
    two vectors in space.
    """

    DOT = "DOT"
    """Dot product similarity: the algebraic similarity between two
    vectors.
    """

    COSINE = "COSINE"
    """Cosine similarity: the cosine of the angle between two vectors."""

    MANHATTAN = "MANHATTAN"
    """Manhattan distance (L1 norm): the sum of absolute differences
    across dimensions.
    """
70
71
class VectorStorageFormat(Enum):
    """Enumeration of the data formats in which vector components are
    stored.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    INT8 = "INT8"
    """Components stored as 8-bit integers."""

    BINARY = "BINARY"
    """Components stored in binary format."""

    FLOAT32 = "FLOAT32"
    """Components stored as 32-bit floating-point numbers."""

    FLOAT64 = "FLOAT64"
    """Components stored as 64-bit floating-point numbers."""
97
98
class VectorStorageType(Enum):
    """Enumeration of the vector storage types (sparse or dense).

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.43

    """

    SPARSE = "SPARSE"
    """Sparse vector: a vector whose value is zero for most of its
    dimensions.
    """

    DENSE = "DENSE"
    """Dense vector: a vector in which most, if not all, elements hold
    meaningful values.
    """
118
119
@dataclass
class VectorIndexConfig:
    """Define the configuration for Oracle VECTOR Index.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    :param index_type: Enum value from :class:`.VectorIndexType`
     Specifies the indexing method. For HNSW, this must be
     :attr:`.VectorIndexType.HNSW`.

    :param distance: Enum value from :class:`.VectorDistanceType`
     specifies the metric for calculating distance between VECTORS.

    :param accuracy: integer. Should be in the range 0 to 100
     Specifies the accuracy of the nearest neighbor search during
     query execution.

    :param parallel: integer. Specifies degree of parallelism.

    :param hnsw_neighbors: integer. Should be in the range 0 to
     2048. Specifies the number of nearest neighbors considered
     during the search. The attribute
     :attr:`.VectorIndexConfig.hnsw_neighbors` is HNSW index specific.

    :param hnsw_efconstruction: integer. Should be in the range 0
     to 65535. Controls the trade-off between indexing speed and
     recall quality during index construction. The attribute
     :attr:`.VectorIndexConfig.hnsw_efconstruction` is HNSW index
     specific.

    :param ivf_neighbor_partitions: integer. Should be in the range
     0 to 10,000,000. Specifies the number of partitions used to
     divide the dataset. The attribute
     :attr:`.VectorIndexConfig.ivf_neighbor_partitions` is IVF index
     specific.

    :param ivf_sample_per_partition: integer. Should be between 1
     and ``num_vectors / neighbor partitions``. Specifies the
     number of samples used per partition. The attribute
     :attr:`.VectorIndexConfig.ivf_sample_per_partition` is IVF index
     specific.

    :param ivf_min_vectors_per_partition: integer. From 0 (no trimming)
     to the total number of vectors (results in 1 partition). Specifies
     the minimum number of vectors per partition. The attribute
     :attr:`.VectorIndexConfig.ivf_min_vectors_per_partition`
     is IVF index specific.

    :raises TypeError: if any of the integer options is given a
     non-``None`` value that is not an ``int``.

    :raises ValueError: if ``index_type`` is not a valid
     :class:`.VectorIndexType` member or member value.

    """

    index_type: VectorIndexType = VectorIndexType.HNSW
    distance: Optional[VectorDistanceType] = None
    accuracy: Optional[int] = None
    hnsw_neighbors: Optional[int] = None
    hnsw_efconstruction: Optional[int] = None
    ivf_neighbor_partitions: Optional[int] = None
    ivf_sample_per_partition: Optional[int] = None
    ivf_min_vectors_per_partition: Optional[int] = None
    parallel: Optional[int] = None

    def __post_init__(self):
        # Normalize through the enum constructor so that either a member
        # or its string value is accepted; invalid input raises ValueError.
        self.index_type = VectorIndexType(self.index_type)

        # Only the type of the numeric options is validated here; the
        # documented ranges are not checked by this class.
        for name in (
            "hnsw_neighbors",
            "hnsw_efconstruction",
            "ivf_neighbor_partitions",
            "ivf_sample_per_partition",
            "ivf_min_vectors_per_partition",
            "parallel",
            "accuracy",
        ):
            value = getattr(self, name)
            if value is not None and not isinstance(value, int):
                raise TypeError(
                    f"{name} must be an integer if provided, "
                    f"got {type(value).__name__}"
                )
199
200
class SparseVector:
    """
    Minimal SQLAlchemy-level stand-in for a sparse vector value,
    modeled on ``oracledb.SparseVector``.

    Stores the dimension count alongside an array of indices and an
    array of values, which must have the same length.

    .. versionadded:: 2.0.43

    """

    def __init__(
        self,
        num_dimensions: int,
        indices: Union[list, array.array],
        values: Union[list, array.array],
    ):
        # Indices are always held as an unsigned-int ("I") array; any
        # other input, including arrays of another typecode, is converted.
        already_uint = (
            isinstance(indices, array.array) and indices.typecode == "I"
        )
        if not already_uint:
            indices = array.array("I", indices)

        # An existing array keeps its typecode; anything else becomes an
        # array of doubles.
        values = (
            values
            if isinstance(values, array.array)
            else array.array("d", values)
        )

        if len(values) != len(indices):
            raise TypeError("indices and values must be of the same length!")

        self.num_dimensions = num_dimensions
        self.indices = indices
        self.values = values

    def __str__(self):
        size = len(self.indices)
        typecode = self.values.typecode
        return (
            f"SparseVector(num_dimensions={self.num_dimensions}, "
            f"size={size}, typecode={typecode})"
        )
232
233
class VECTOR(types.TypeEngine):
    """Oracle VECTOR datatype.

    For complete background on using this type, see
    :ref:`oracle_vector_datatype`.

    .. versionadded:: 2.0.41

    """

    cache_ok = True

    __visit_name__ = "VECTOR"

    # ``array.array`` typecode used when a plain Python list is converted
    # for binding, keyed by the configured storage format.
    _typecode_map = {
        VectorStorageFormat.INT8: "b",  # signed char
        VectorStorageFormat.BINARY: "B",  # unsigned char
        VectorStorageFormat.FLOAT32: "f",  # float
        VectorStorageFormat.FLOAT64: "d",  # double
    }

    def __init__(self, dim=None, storage_format=None, storage_type=None):
        """Construct a VECTOR.

        :param dim: integer. The dimension of the VECTOR datatype. This
         should be an integer value.

        :param storage_format: VectorStorageFormat. The VECTOR storage
         type format. This should be Enum values from
         :class:`.VectorStorageFormat` INT8, BINARY, FLOAT32, or FLOAT64.

        :param storage_type: VectorStorageType. The Vector storage type.
         This should be Enum values from :class:`.VectorStorageType`
         SPARSE or DENSE.

        :raises TypeError: if any argument is given and is not of the
         expected type.

        """

        if dim is not None and not isinstance(dim, int):
            raise TypeError("dim must be an integer")
        if storage_format is not None and not isinstance(
            storage_format, VectorStorageFormat
        ):
            raise TypeError(
                "storage_format must be an enum of type VectorStorageFormat"
            )
        if storage_type is not None and not isinstance(
            storage_type, VectorStorageType
        ):
            raise TypeError(
                "storage_type must be an enum of type VectorStorageType"
            )

        self.dim = dim
        self.storage_format = storage_format
        self.storage_type = storage_type

    def _cached_bind_processor(self, dialect):
        """
        Return a callable that prepares a Python value for binding.

        ``None`` and ``array.array`` values pass through unchanged, a
        ``list`` is converted to an ``array.array`` using the typecode
        for this type's storage format, and a SQLAlchemy-side
        :class:`.SparseVector` is converted to the DBAPI's
        ``SparseVector``. Any other value raises ``TypeError``.
        """

        def process(value):
            if value is None or isinstance(value, array.array):
                return value

            # Convert list to an array.array
            if isinstance(value, list):
                typecode = self._array_typecode(self.storage_format)
                return array.array(typecode, value)

            # Convert SQLAlchemy SparseVector to the DBAPI SparseVector
            if isinstance(value, SparseVector):
                return dialect.dbapi.SparseVector(
                    value.num_dimensions,
                    value.indices,
                    value.values,
                )

            # Single-line message: a triple-quoted literal here would
            # embed newlines and source indentation in the error text.
            raise TypeError(
                "Invalid input for VECTOR: expected a list, an "
                "array.array, or a SparseVector object."
            )

        return process

    def _cached_result_processor(self, dialect, coltype):
        """
        Return a callable that converts database-returned values into
        Python-native representations.

        An ``array.array`` is converted to a plain Python list and a
        DBAPI ``SparseVector`` is converted into the SQLAlchemy-side
        :class:`.SparseVector`. ``None`` and any value of another type
        are returned unchanged.
        """

        def process(value):
            if value is None:
                return None

            if isinstance(value, array.array):
                return list(value)

            # Look the class up defensively so a DBAPI module without a
            # SparseVector attribute does not raise AttributeError here.
            sparse_cls = getattr(dialect.dbapi, "SparseVector", None)
            if sparse_cls is not None and isinstance(value, sparse_cls):
                return SparseVector(
                    num_dimensions=value.num_dimensions,
                    indices=value.indices,
                    values=value.values,
                )

            # Pass unrecognized values through rather than silently
            # replacing them with None.
            return value

        return process

    def _array_typecode(self, typecode):
        """
        Map a storage format to an ``array.array`` typecode.

        Despite its name, ``typecode`` is the
        :class:`.VectorStorageFormat` to look up (or ``None``). Formats
        not present in the map, including ``None``, fall back to ``"d"``.
        """
        return self._typecode_map.get(typecode, "d")

    class comparator_factory(types.TypeEngine.Comparator):
        """Comparator adding vector distance operators to VECTOR
        expressions; each operator is typed as ``Float``."""

        def l2_distance(self, other):
            """Apply the ``<->`` operator against ``other``."""
            return self.op("<->", return_type=Float)(other)

        def inner_product(self, other):
            """Apply the ``<#>`` operator against ``other``."""
            return self.op("<#>", return_type=Float)(other)

        def cosine_distance(self, other):
            """Apply the ``<=>`` operator against ``other``."""
            return self.op("<=>", return_type=Float)(other)