1# dialects/oracle/vector.py
2# Copyright (C) 2005-2025 the SQLAlchemy authors and contributors
3# <see AUTHORS file>
4#
5# This module is part of SQLAlchemy and is released under
6# the MIT License: https://www.opensource.org/licenses/mit-license.php
7# mypy: ignore-errors
8
9
10from __future__ import annotations
11
12import array
13from dataclasses import dataclass
14from enum import Enum
15from typing import Optional
16from typing import Union
17
18import sqlalchemy.types as types
19from sqlalchemy.types import Float
20
21
class VectorIndexType(Enum):
    """Index structures that can back an Oracle VECTOR index.

    Each member's value is the upper-case keyword string of the same name.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    HNSW = "HNSW"
    """Hierarchical Navigable Small World (HNSW) index type."""

    IVF = "IVF"
    """Inverted File Index (IVF) index type."""
39
40
class VectorDistanceType(Enum):
    """Distance metrics available for comparing vectors.

    Each member's value is the upper-case keyword string of the same name.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    EUCLIDEAN = "EUCLIDEAN"
    """Euclidean distance (L2 norm).

    The straight-line distance between two vectors in space.
    """

    DOT = "DOT"
    """Dot product similarity.

    The algebraic similarity between two vectors.
    """

    COSINE = "COSINE"
    """Cosine similarity.

    The cosine of the angle between two vectors.
    """

    MANHATTAN = "MANHATTAN"
    """Manhattan distance (L1 norm).

    The sum of the absolute differences across all dimensions.
    """
70
71
class VectorStorageFormat(Enum):
    """Data formats in which the components of a vector can be stored.

    Each member's value is the upper-case keyword string of the same name.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    """

    INT8 = "INT8"
    """8-bit integer format."""

    BINARY = "BINARY"
    """Binary format."""

    FLOAT32 = "FLOAT32"
    """32-bit floating-point format."""

    FLOAT64 = "FLOAT64"
    """64-bit floating-point format."""
97
98
class VectorStorageType(Enum):
    """Storage layouts of a vector: sparse or dense.

    Each member's value is the upper-case keyword string of the same name.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.43

    """

    SPARSE = "SPARSE"
    """Sparse vector: most of its dimensions hold a zero value."""

    DENSE = "DENSE"
    """Dense vector: most, if not all, of its elements hold meaningful
    values.
    """
118
119
@dataclass
class VectorIndexConfig:
    """Define the configuration for Oracle VECTOR Index.

    See :ref:`oracle_vector_datatype` for background.

    .. versionadded:: 2.0.41

    :param index_type: Enum value from :class:`.VectorIndexType`
     Specifies the indexing method. For HNSW, this must be
     :attr:`.VectorIndexType.HNSW`. A plain value accepted by the
     :class:`.VectorIndexType` constructor (e.g. ``"IVF"``) is converted
     to the corresponding enum member.

    :param distance: Enum value from :class:`.VectorDistanceType`
     specifies the metric for calculating distance between VECTORS.

    :param accuracy: integer. Should be in the range 0 to 100
     Specifies the accuracy of the nearest neighbor search during
     query execution.

    :param parallel: integer. Specifies degree of parallelism.

    :param hnsw_neighbors: integer. Should be in the range 0 to
     2048. Specifies the number of nearest neighbors considered
     during the search. The attribute
     :attr:`.VectorIndexConfig.hnsw_neighbors` is HNSW index specific.

    :param hnsw_efconstruction: integer. Should be in the range 0
     to 65535. Controls the trade-off between indexing speed and
     recall quality during index construction. The attribute
     :attr:`.VectorIndexConfig.hnsw_efconstruction` is HNSW index
     specific.

    :param ivf_neighbor_partitions: integer. Should be in the range
     0 to 10,000,000. Specifies the number of partitions used to
     divide the dataset. The attribute
     :attr:`.VectorIndexConfig.ivf_neighbor_partitions` is IVF index
     specific.

    :param ivf_sample_per_partition: integer. Should be between 1
     and ``num_vectors / neighbor partitions``. Specifies the
     number of samples used per partition. The attribute
     :attr:`.VectorIndexConfig.ivf_sample_per_partition` is IVF index
     specific.

    :param ivf_min_vectors_per_partition: integer. From 0 (no trimming)
     to the total number of vectors (results in 1 partition). Specifies
     the minimum number of vectors per partition. The attribute
     :attr:`.VectorIndexConfig.ivf_min_vectors_per_partition`
     is IVF index specific.

    """

    index_type: VectorIndexType = VectorIndexType.HNSW
    distance: Optional[VectorDistanceType] = None
    accuracy: Optional[int] = None
    hnsw_neighbors: Optional[int] = None
    hnsw_efconstruction: Optional[int] = None
    ivf_neighbor_partitions: Optional[int] = None
    ivf_sample_per_partition: Optional[int] = None
    ivf_min_vectors_per_partition: Optional[int] = None
    parallel: Optional[int] = None

    def __post_init__(self):
        """Normalize ``index_type`` and type-check the integer options.

        :raises ValueError: if ``index_type`` is not accepted by the
         :class:`.VectorIndexType` constructor.
        :raises TypeError: if any integer option is provided but is not
         an ``int``.
        """
        # Accept either an enum member or its raw value (e.g. "HNSW").
        self.index_type = VectorIndexType(self.index_type)

        # Only the type is validated here; the documented ranges are not
        # enforced by this class.
        for name in (
            "hnsw_neighbors",
            "hnsw_efconstruction",
            "ivf_neighbor_partitions",
            "ivf_sample_per_partition",
            "ivf_min_vectors_per_partition",
            "parallel",
            "accuracy",
        ):
            value = getattr(self, name)
            if value is not None and not isinstance(value, int):
                # The trailing space keeps the two fragments from fusing
                # into "ifprovided".
                raise TypeError(
                    f"{name} must be an integer if "
                    f"provided, got {type(value).__name__}"
                )
199
200
class SparseVector:
    """
    Lightweight SQLAlchemy-side counterpart of ``oracledb.SparseVector``.

    Stores the number of dimensions together with two parallel arrays:
    the indices of the stored elements and their values.

    .. versionadded:: 2.0.43

    """

    def __init__(
        self,
        num_dimensions: int,
        indices: Union[list, array.array],
        values: Union[list, array.array],
    ):
        """Construct a SparseVector.

        :param num_dimensions: stored as-is on the instance.

        :param indices: indices of the stored elements; anything that is
         not already an ``array.array`` with typecode ``"I"`` is copied
         into one.

        :param values: values of the stored elements; anything that is not
         already an ``array.array`` is copied into one with typecode
         ``"d"``.

        :raises TypeError: if ``indices`` and ``values`` differ in length.
        """
        already_uint_array = (
            isinstance(indices, array.array) and indices.typecode == "I"
        )
        if not already_uint_array:
            indices = array.array("I", indices)

        # An existing array keeps whatever typecode it came with.
        if not isinstance(values, array.array):
            values = array.array("d", values)

        if len(indices) != len(values):
            raise TypeError("indices and values must be of the same length!")

        self.num_dimensions = num_dimensions
        self.indices = indices
        self.values = values

    def __str__(self):
        """Return a short summary: dimensions, stored size and typecode."""
        summary = ", ".join(
            (
                f"num_dimensions={self.num_dimensions}",
                f"size={len(self.indices)}",
                f"typecode={self.values.typecode}",
            )
        )
        return f"SparseVector({summary})"
232
233
class VECTOR(types.TypeEngine):
    """Oracle VECTOR datatype.

    For complete background on using this type, see
    :ref:`oracle_vector_datatype`.

    .. versionadded:: 2.0.41

    """

    cache_ok = True
    __visit_name__ = "VECTOR"

    # Maps each storage format to the ``array.array`` typecode used when a
    # plain Python list is converted for binding.
    _typecode_map = {
        VectorStorageFormat.INT8: "b",  # Signed int
        VectorStorageFormat.BINARY: "B",  # Unsigned int
        VectorStorageFormat.FLOAT32: "f",  # Float
        VectorStorageFormat.FLOAT64: "d",  # Double
    }

    def __init__(self, dim=None, storage_format=None, storage_type=None):
        """Construct a VECTOR.

        :param dim: integer. The dimension of the VECTOR datatype. This
         should be an integer value.

        :param storage_format: VectorStorageFormat. The VECTOR storage
         type format. This should be Enum values from
         :class:`.VectorStorageFormat` INT8, BINARY, FLOAT32, or FLOAT64.

        :param storage_type: VectorStorageType. The Vector storage type. This
         should be Enum values from :class:`.VectorStorageType` SPARSE or
         DENSE.

        :raises TypeError: if any provided argument is not of the
         expected type.

        """

        if dim is not None and not isinstance(dim, int):
            raise TypeError("dim must be an integer")
        if storage_format is not None and not isinstance(
            storage_format, VectorStorageFormat
        ):
            raise TypeError(
                "storage_format must be an enum of type VectorStorageFormat"
            )
        if storage_type is not None and not isinstance(
            storage_type, VectorStorageType
        ):
            raise TypeError(
                "storage_type must be an enum of type VectorStorageType"
            )

        self.dim = dim
        self.storage_format = storage_format
        self.storage_type = storage_type

    def _cached_bind_processor(self, dialect):
        """
        Return a callable that prepares a Python value for binding.

        ``None`` and ``array.array`` values are passed through unchanged,
        a ``list`` is converted to an ``array.array`` whose typecode
        follows this type's storage format, and a SQLAlchemy-side
        :class:`.SparseVector` is converted into the ``SparseVector``
        class of the dialect's DBAPI module. Any other input raises
        ``TypeError``.
        """

        def process(value):
            if value is None or isinstance(value, array.array):
                return value

            # Convert list to an array.array
            if isinstance(value, list):
                typecode = self._array_typecode(self.storage_format)
                return array.array(typecode, value)

            # Convert SQLAlchemy SparseVector to the DBAPI SparseVector
            if isinstance(value, SparseVector):
                return dialect.dbapi.SparseVector(
                    value.num_dimensions,
                    value.indices,
                    value.values,
                )

            # Single-line message: a triple-quoted literal here would leak
            # source indentation and newlines into the exception text.
            raise TypeError(
                "Invalid input for VECTOR: expected a list, an array.array, "
                "or a SparseVector object."
            )

        return process

    def _cached_result_processor(self, dialect, coltype):
        """
        Return a callable that converts database-returned values.

        ``None`` stays ``None``, an ``array.array`` is converted to a plain
        Python list, and a DBAPI ``SparseVector`` is converted into the
        SQLAlchemy-side :class:`.SparseVector`. A value of any other type
        is returned unchanged rather than being silently replaced with
        ``None``.
        """

        def process(value):
            if value is None:
                return None

            if isinstance(value, array.array):
                return list(value)

            # Convert DBAPI SparseVector to SQLAlchemy SparseVector object
            if isinstance(value, dialect.dbapi.SparseVector):
                return SparseVector(
                    num_dimensions=value.num_dimensions,
                    indices=value.indices,
                    values=value.values,
                )

            # Unrecognized type: hand it back untouched so that no
            # fetched data is lost.
            return value

        return process

    def _array_typecode(self, typecode):
        """
        Map a storage format to its ``array.array`` typecode.

        Despite its name, the ``typecode`` argument is the
        :class:`.VectorStorageFormat` to look up; formats that are not in
        the map (including ``None``) fall back to ``"d"``.
        """
        return self._typecode_map.get(typecode, "d")

    class comparator_factory(types.TypeEngine.Comparator):
        """Comparator adding vector distance operators to VECTOR
        expressions."""

        def l2_distance(self, other):
            """Apply the ``<->`` operator to ``other``; result typed as
            Float."""
            return self.op("<->", return_type=Float)(other)

        def inner_product(self, other):
            """Apply the ``<#>`` operator to ``other``; result typed as
            Float."""
            return self.op("<#>", return_type=Float)(other)

        def cosine_distance(self, other):
            """Apply the ``<=>`` operator to ``other``; result typed as
            Float."""
            return self.op("<=>", return_type=Float)(other)