1# -*- coding: utf-8 -*-
2# Copyright (c) 2019 Ansible Project
3# (c) 2016 Toshio Kuratomi <tkuratomi@ansible.com>
4# Simplified BSD License (see licenses/simplified_bsd.txt or https://opensource.org/licenses/BSD-2-Clause)
5
6from __future__ import annotations
7
8import codecs
9import datetime
10import json
11
12from ansible.module_utils.six.moves.collections_abc import Set
13from ansible.module_utils.six import (
14 PY3,
15 binary_type,
16 iteritems,
17 text_type,
18)
19
# Probe once, at import time, whether the ``surrogateescape`` codec error
# handler is registered.  ``codecs.lookup_error`` raises LookupError when
# it is not.
try:
    codecs.lookup_error('surrogateescape')
except LookupError:
    HAS_SURROGATEESCAPE = False
else:
    HAS_SURROGATEESCAPE = True


# Values of the ``errors`` argument that are not real codec error handlers:
# the converters translate them into a real handler depending on
# HAS_SURROGATEESCAPE.  ``None`` is included because it selects the default
# strategy.
_COMPOSED_ERROR_HANDLERS = frozenset([
    None,
    'surrogate_or_replace',
    'surrogate_or_strict',
    'surrogate_then_replace',
])
30
31
def to_bytes(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    """Make sure that a string is a byte string

    :arg obj: An object to make sure is a byte string.  In most cases this
        will be either a text string or a byte string.  However, with
        ``nonstring='simplerepr'``, this can be used as a traceback-free
        version of ``str(obj)``.
    :kwarg encoding: The encoding to use to transform from a text string to
        a byte string.  Defaults to using 'utf-8'.
    :kwarg errors: The error handler to use if the text string is not
        encodable using the specified encoding.  Any valid `codecs error
        handler <https://docs.python.org/3/library/codecs.html#codec-base-classes>`_
        may be specified. There are three additional error strategies
        specifically aimed at helping people to port code.  The first two are:

            :surrogate_or_strict: Will use ``surrogateescape`` if it is a valid
                handler, otherwise it will use ``strict``
            :surrogate_or_replace: Will use ``surrogateescape`` if it is a valid
                handler, otherwise it will use ``replace``.

        Because ``surrogateescape`` was added in Python3 this usually means that
        Python3 will use ``surrogateescape`` and Python2 will use the fallback
        error handler. Note that the code checks for ``surrogateescape`` when the
        module is imported.  If you have a backport of ``surrogateescape`` for
        Python2, be sure to register the error handler prior to importing this
        module.

        The last error handler is:

            :surrogate_then_replace: Will use ``surrogateescape`` if it is a valid
                handler.  If encoding with ``surrogateescape`` would traceback,
                surrogates are first replaced with a replacement character
                and then the string is encoded using ``replace`` (which replaces
                the rest of the nonencodable characters).  If ``surrogateescape``
                is not present it will simply use ``replace``.  (Added in
                Ansible 2.3)  This strategy is designed to never traceback when
                it attempts to encode a string, and it is also applied to the
                string form of a nonstring ``obj``.

        The default until Ansible-2.2 was ``surrogate_or_replace``
        From Ansible-2.3 onwards, the default is ``surrogate_then_replace``.

    :kwarg nonstring: The strategy to use if a nonstring is specified in
        ``obj``.  Default is 'simplerepr'.  Valid values are:

        :simplerepr: The default.  This takes the ``str`` of the object and
            then returns the bytes version of that string.
        :empty: Return an empty byte string
        :passthru: Return the object passed in
        :strict: Raise a :exc:`TypeError`

    :returns: Typically this returns a byte string.  If a nonstring object is
        passed in this may be a different type depending on the strategy
        specified by nonstring.  This will never return a text string.
    :raises TypeError: If ``obj`` is not a string and ``nonstring`` is
        ``strict`` or is not one of the valid values.
    :raises UnicodeEncodeError: If the text cannot be encoded and the
        requested error handler does not fall back to replacement.

    .. note:: If passed a byte string, this function does not check that the
        string is valid in the specified encoding.  If it's important that the
        byte string is in the specified encoding do::

            encoded_string = to_bytes(to_text(input_string, 'latin-1'), 'utf-8')

    .. versionchanged:: 2.3

        Added the ``surrogate_then_replace`` error handler and made it the default error handler.
    """
    if isinstance(obj, binary_type):
        return obj

    # Keep the handler the caller asked for.  The composed names are
    # translated into real codec handlers below, but both the replace
    # fallback and the recursive call for nonstrings must be driven by the
    # original request, not by the translated value.
    original_errors = errors
    if errors in _COMPOSED_ERROR_HANDLERS:
        if HAS_SURROGATEESCAPE:
            errors = 'surrogateescape'
        elif errors == 'surrogate_or_strict':
            errors = 'strict'
        else:
            errors = 'replace'

    if isinstance(obj, text_type):
        try:
            # Try this first as it's the fastest
            return obj.encode(encoding, errors)
        except UnicodeEncodeError:
            if original_errors not in (None, 'surrogate_then_replace'):
                raise

            # Only reached for surrogate_then_replace (the default) when
            # encoding with surrogateescape failed.
            try:
                # Slow but works: turn escaped surrogates back into the bytes
                # they stand for, then let ``replace`` deal with whatever is
                # not valid utf-8 / not encodable in the target encoding.
                escaped = obj.encode('utf-8', 'surrogateescape')
            except UnicodeEncodeError:
                # surrogateescape only handles U+DC80..U+DCFF.  Any other
                # lone surrogate makes it raise again, so fall back to a
                # plain ``replace`` encode to honour the "never traceback"
                # promise of this strategy.
                return obj.encode(encoding, 'replace')
            return escaped.decode('utf-8', 'replace').encode(encoding, 'replace')

    # Note: We do these last even though we have to call to_bytes again on the
    # value because we're optimizing the common case
    if nonstring == 'simplerepr':
        try:
            value = str(obj)
        except UnicodeError:
            try:
                value = repr(obj)
            except UnicodeError:
                # Giving up
                return b''
    elif nonstring == 'passthru':
        return obj
    elif nonstring == 'empty':
        return b''
    elif nonstring == 'strict':
        raise TypeError('obj must be a string type')
    else:
        raise TypeError('Invalid value %s for to_bytes\' nonstring parameter' % nonstring)

    # Recurse with the caller's original handler so that the string form of
    # the object gets the same fallback behaviour a plain text string would.
    return to_bytes(value, encoding, original_errors)
147
148
def to_text(obj, encoding='utf-8', errors=None, nonstring='simplerepr'):
    """Make sure that a string is a text string

    :arg obj: An object to make sure is a text string.  In most cases this
        will be either a text string or a byte string.  However, with
        ``nonstring='simplerepr'``, this can be used as a traceback-free
        version of ``str(obj)``.
    :kwarg encoding: The encoding to use to transform from a byte string to
        a text string.  Defaults to using 'utf-8'.
    :kwarg errors: The error handler to use if the byte string is not
        decodable using the specified encoding.  Any valid `codecs error
        handler <https://docs.python.org/3/library/codecs.html#codec-base-classes>`_
        may be specified.   We support three additional error strategies
        specifically aimed at helping people to port code:

            :surrogate_or_strict: Will use ``surrogateescape`` if it is a valid
                handler, otherwise it will use ``strict``
            :surrogate_or_replace: Will use ``surrogateescape`` if it is a valid
                handler, otherwise it will use ``replace``.
            :surrogate_then_replace: Does the same as ``surrogate_or_replace``
                but was added for symmetry with the error handlers in
                :func:`ansible.module_utils.common.text.converters.to_bytes`
                (Added in Ansible 2.3)

        Because ``surrogateescape`` was added in Python3 this usually means that
        Python3 will use ``surrogateescape`` and Python2 will use the fallback
        error handler. Note that the code checks for ``surrogateescape`` when the
        module is imported.  If you have a backport of ``surrogateescape`` for
        Python2, be sure to register the error handler prior to importing this
        module.

        The default until Ansible-2.2 was ``surrogate_or_replace``
        From Ansible-2.3 onwards, the default is ``surrogate_then_replace``
        for symmetry with
        :func:`ansible.module_utils.common.text.converters.to_bytes` .
    :kwarg nonstring: The strategy to use if a nonstring is specified in
        ``obj``.  Default is 'simplerepr'.  Valid values are:

        :simplerepr: The default.  This takes the ``str`` of the object and
            then returns the text version of that string.
        :empty: Return an empty text string
        :passthru: Return the object passed in
        :strict: Raise a :exc:`TypeError`

    :returns: Typically this returns a text string.  If a nonstring object is
        passed in this may be a different type depending on the strategy
        specified by nonstring.  This will never return a byte string.
    :raises TypeError: If ``obj`` is not a string and ``nonstring`` is
        ``strict`` or is not one of the valid values.

    .. versionchanged:: 2.3

        Added the surrogate_then_replace error handler and made it the default error handler.
    """
    # Text is already what the caller wants.
    if isinstance(obj, text_type):
        return obj

    # Translate the composed handler names into a real codec error handler.
    if errors in _COMPOSED_ERROR_HANDLERS:
        if HAS_SURROGATEESCAPE:
            errors = 'surrogateescape'
        else:
            errors = 'strict' if errors == 'surrogate_or_strict' else 'replace'

    if isinstance(obj, binary_type):
        # No extra fallback step for surrogate_then_replace is done here; the
        # translated handler is simply handed to ``decode``.
        return obj.decode(encoding, errors)

    # Everything below deals with nonstrings.  This comes last, even though
    # the simplerepr case has to call to_text a second time, because strings
    # are the common case.
    if nonstring == 'passthru':
        return obj
    if nonstring == 'empty':
        return u''
    if nonstring == 'strict':
        raise TypeError('obj must be a string type')
    if nonstring != 'simplerepr':
        raise TypeError("Invalid value %s for to_text's nonstring parameter" % nonstring)

    try:
        stringified = str(obj)
    except UnicodeError:
        try:
            stringified = repr(obj)
        except UnicodeError:
            # Neither str() nor repr() worked: give up.
            return u''

    return to_text(stringified, encoding, errors)
238
239
#: :py:func:`to_native`
#:      Transform a variable into the native str type for the python version
#:
#:      On Python2, this is an alias for
#:      :func:`~ansible.module_utils.to_bytes`.  On Python3 it is an alias for
#:      :func:`~ansible.module_utils.to_text`.  It makes it easier to
#:      transform a variable into the native str type for the python version
#:      the code is running on.  Use this when constructing the message to
#:      send to exceptions or when dealing with an API that needs to take
#:      a native string.  Example::
#:
#:          try:
#:              1//0
#:          except ZeroDivisionError as e:
#:              raise MyException('Encountered an error: %s' % to_native(e))
to_native = to_text if PY3 else to_bytes
259
260
def _json_encode_fallback(obj):
    """``default`` hook for :func:`json.dumps`.

    :arg obj: The object that the json encoder could not serialize itself.
    :returns: A list holding the members of ``obj`` when it is a ``Set``, or
        the ISO 8601 string of ``obj`` when it is a ``datetime.datetime``.
    :raises TypeError: For any other type of object.
    """
    if isinstance(obj, Set):
        return list(obj)

    if isinstance(obj, datetime.datetime):
        return obj.isoformat()

    raise TypeError("Cannot json serialize %s" % to_native(obj))
267
268
def jsonify(data, **kwargs):
    """Serialize ``data`` to a JSON string.

    Byte strings inside ``data`` are converted to text first, trying the
    ``utf-8`` encoding and then ``latin-1``.

    :arg data: The (possibly nested) data structure to serialize.
    :kwarg kwargs: Passed through unchanged to :func:`json.dumps`.
    :returns: The JSON document produced by :func:`json.dumps`.
    :raises UnicodeError: If the conversion to text raised
        ``UnicodeDecodeError`` for every encoding that was tried.
    """
    # After 2.18, we should remove this loop, and hardcode to utf-8 in alignment with requiring utf-8 module responses
    for candidate in ("utf-8", "latin-1"):
        try:
            text_data = container_to_text(data, encoding=candidate)
        except UnicodeDecodeError:
            # This encoding does not fit the data, try the next one.
            continue
        else:
            return json.dumps(text_data, default=_json_encode_fallback, **kwargs)

    raise UnicodeError('Invalid unicode encoding encountered')
278
279
def container_to_bytes(d, encoding='utf-8', errors='surrogate_or_strict'):
    """Recursively convert dict keys and values to byte str

    Specialized for json return because this only handles, lists, tuples,
    and dict container types (the containers that the json module returns)

    :arg d: A text string, a dict, list or tuple (arbitrarily nested), or any
        other object.
    :kwarg encoding: The encoding handed to ``to_bytes`` for text strings.
    :kwarg errors: The error handler handed to ``to_bytes`` for text strings.
    :returns: A new dict, list or tuple with every text string converted to
        a byte string.  Objects of any other type are returned unchanged.
    """
    if isinstance(d, text_type):
        return to_bytes(d, encoding=encoding, errors=errors)

    if isinstance(d, dict):
        # Keys are converted as well as values.
        return {
            container_to_bytes(key, encoding, errors): container_to_bytes(value, encoding, errors)
            for key, value in iteritems(d)
        }

    if isinstance(d, list):
        return [container_to_bytes(item, encoding, errors) for item in d]

    if isinstance(d, tuple):
        return tuple(container_to_bytes(item, encoding, errors) for item in d)

    # Not text and not a supported container: hand it back untouched.
    return d
297
298
def container_to_text(d, encoding='utf-8', errors='surrogate_or_strict'):
    """Recursively convert dict keys and values to text str

    Specialized for json return because this only handles, lists, tuples,
    and dict container types (the containers that the json module returns)

    :arg d: A byte string, a dict, list or tuple (arbitrarily nested), or any
        other object.
    :kwarg encoding: The encoding handed to ``to_text`` for byte strings.
    :kwarg errors: The error handler handed to ``to_text`` for byte strings.
    :returns: A new dict, list or tuple with every byte string converted to
        a text string.  Objects of any other type are returned unchanged.
    """
    if isinstance(d, binary_type):
        # Warning, can traceback
        return to_text(d, encoding=encoding, errors=errors)

    if isinstance(d, dict):
        # Keys are converted as well as values.
        return {
            container_to_text(key, encoding, errors): container_to_text(value, encoding, errors)
            for key, value in iteritems(d)
        }

    if isinstance(d, list):
        return [container_to_text(item, encoding, errors) for item in d]

    if isinstance(d, tuple):
        return tuple(container_to_text(item, encoding, errors) for item in d)

    # Not bytes and not a supported container: hand it back untouched.
    return d