1"""parsing and generation of content lines"""
2
3import re
4
5from icalendar.parser.parameter import Parameters
6from icalendar.parser.property import unescape_backslash, unescape_list_or_string
7from icalendar.parser.string import (
8 escape_string,
9 foldline,
10 unescape_string,
11 validate_token,
12)
13from icalendar.parser_tools import DEFAULT_ENCODING, ICAL_TYPE, to_unicode
14
# Matches a line fold: one or more line breaks (LF or CRLF) followed by a
# single space or tab. Substituting it with "" unfolds a folded content line.
UFOLD = re.compile("(\r?\n)+[ \t]")
# Matches a single line break (LF or CRLF); used to split text into lines.
NEWLINE = re.compile(r"\r?\n")
17
18OWS = " \t"
19OWS_AROUND_DELIMITERS_RE = re.compile(r"[ \t]*([;=])[ \t]*")
20
21
22def _strip_ows_around_delimiters(st: str, delimiters: str = ";=") -> str:
23 """Strip optional whitespace around delimiters outside of quoted sections,
24 respecting backslash escapes so that escaped delimiters are not treated as
25 separators.
26
27 This is a lenient parsing helper (used when strict=False) to support
28 iCalendar content lines that contain extra whitespace around tokens.
29 """
30 if not st:
31 return st
32
33 # Fast path for the common case in non-strict mode:
34 # no whitespace in the parameter section means there is nothing to normalize.
35 if " " not in st and "\t" not in st:
36 return st
37
38 # Fast regex-based path for simple parameter sections without quoting/escaping.
39 if delimiters == ";=" and '"' not in st and "\\" not in st:
40 return OWS_AROUND_DELIMITERS_RE.sub(r"\1", st).strip()
41
42 out: list[str] = []
43 pending_ws: list[str] = []
44 in_quotes = False
45 escaped = False
46 # True only if the last appended char was a raw delimiter.
47 last_was_delimiter = False
48
49 def flush_pending() -> None:
50 nonlocal pending_ws
51 if not pending_ws:
52 return
53 if not last_was_delimiter:
54 out.extend(pending_ws)
55 pending_ws.clear()
56
57 for ch in st:
58 # Handle escaped character (the backslash set escaped in previous iteration)
59 if escaped:
60 flush_pending()
61 out.append(ch)
62 escaped = False
63 last_was_delimiter = False
64 continue
65
66 # Handle backslash to escape next character
67 if ch == "\\" and not in_quotes:
68 flush_pending()
69 out.append(ch)
70 escaped = True
71 last_was_delimiter = False
72 continue
73
74 # Handle quote toggling
75 if ch == '"' and not escaped:
76 in_quotes = not in_quotes
77 flush_pending()
78 out.append(ch)
79 last_was_delimiter = False
80 continue
81
82 # Whitespace outside quotes is buffered
83 if not in_quotes and not escaped and ch in OWS:
84 pending_ws.append(ch)
85 continue
86
87 # Raw delimiter (unescaped and outside quotes)
88 if not in_quotes and not escaped and ch in delimiters:
89 pending_ws.clear()
90 while out and out[-1] in OWS:
91 out.pop()
92 out.append(ch)
93 last_was_delimiter = True
94 continue
95
96 # Regular character
97 flush_pending()
98 out.append(ch)
99 last_was_delimiter = False
100
101 if pending_ws and not last_was_delimiter:
102 out.extend(pending_ws)
103
104 return "".join(out).strip()
105
106
class Contentline(str):
    """A single, unfolded iCalendar content line.

    The instance is the line's text; it can be split into its name,
    parameters and value with :meth:`parts` and folded again for output with
    :meth:`to_ical`.

    Attributes:
        strict: When false, :meth:`parts` tolerates optional whitespace in
            the name and around parameter delimiters.
    """

    __slots__ = ("strict",)

    def __new__(cls, value, strict=False, encoding=DEFAULT_ENCODING):
        """Create a content line from text or bytes.

        Parameters:
            value: The line content; converted with ``to_unicode``.
            strict: Stored on the instance and used by :meth:`parts`.
            encoding: Encoding passed to ``to_unicode``.
        """
        value = to_unicode(value, encoding=encoding)
        assert "\n" not in value, (
            "Content line can not contain unescaped new line characters."
        )
        self = super().__new__(cls, value)
        self.strict = strict
        return self

    @classmethod
    def from_parts(
        cls,
        name: ICAL_TYPE,
        params: Parameters,
        values,
        sorted: bool = True,  # noqa: A002
    ):
        """Build a content line from a name, parameters and a value.

        Parameters:
            name: The property name.
            params: The property parameters; must be a ``Parameters`` instance.
            values: The property value. Objects with a ``to_ical`` method are
                serialized with it; anything else is wrapped in ``vText``.
            sorted: Passed on to ``params.to_ical``.

        Returns:
            ``NAME;PARAMS:VALUE``, or ``NAME:VALUE`` when there are no
            parameters or they serialize to nothing.
        """
        assert isinstance(params, Parameters)
        if hasattr(values, "to_ical"):
            values = values.to_ical()
        else:
            from icalendar.prop import vText

            values = vText(values).to_ical()

        # TODO: after unicode only, remove this
        # Convert back to unicode, after to_ical encoded it.
        name = to_unicode(name)
        values = to_unicode(values)
        if params:
            params = to_unicode(params.to_ical(sorted=sorted))
            if params:
                # some parameter values can be skipped during serialization
                return cls(f"{name};{params}:{values}")
        return cls(f"{name}:{values}")

    def parts(self) -> tuple[str, Parameters, str]:
        """Split the content line into ``name``, ``parameters``, and ``values`` parts.

        Properly handles escaping with backslashes and double-quote sections
        to avoid corrupting URL-encoded characters in values. A delimiter
        only counts when it is outside double quotes and not escaped by a
        backslash; an escaped backslash does not escape the character that
        follows it.

        Example with parameter:

        .. code-block:: ics

            DESCRIPTION;ALTREP="cid:part1.0001@example.org":The Fall'98 Wild

        Example without parameters:

        .. code-block:: ics

            DESCRIPTION:The Fall'98 Wild

        Returns:
            A ``(name, parameters, values)`` tuple.

        Raises:
            ValueError: If the line cannot be split or a part is invalid.
        """
        try:
            name_split: int | None = None
            value_split: int | None = None
            in_quotes: bool = False
            escaped: bool = False

            for i, ch in enumerate(self):
                if escaped:
                    # This character is consumed by the preceding backslash.
                    # That includes a second backslash, which therefore does
                    # not escape whatever comes after it.
                    escaped = False
                    continue
                if ch == "\\" and not in_quotes:
                    escaped = True
                    continue
                if ch == '"':
                    in_quotes = not in_quotes
                    continue
                if in_quotes:
                    continue
                # Find first delimiter for name
                if ch in ":;" and name_split is None:
                    name_split = i
                # Find value delimiter (first colon)
                if ch == ":":
                    value_split = i
                    # Both split points are known now; everything after the
                    # colon is the value and needs no scanning.
                    break

            # Validate parsing results
            if not value_split:
                # No colon found - value is empty, use end of string
                value_split = len(self)

            # Extract name - if no delimiter,
            # take whole string for validate_token to reject
            name = self[:name_split] if name_split else self
            if not self.strict:
                name = re.sub(r"[ \t]+", "", name.strip())
            validate_token(name)

            if not name_split or name_split + 1 == value_split:
                # No delimiter or empty parameter section
                raise ValueError("Invalid content line")  # noqa: TRY301
            # Parse parameters - they still need to be escaped/unescaped
            # for proper handling of commas, semicolons, etc. in parameter values
            raw_param_str = self[name_split + 1 : value_split]
            if not self.strict:
                raw_param_str = _strip_ows_around_delimiters(raw_param_str)
            param_str = escape_string(raw_param_str)
            params = Parameters.from_ical(param_str, strict=self.strict)
            params = Parameters(
                (unescape_string(key), unescape_list_or_string(value))
                for key, value in params.items()
            )
            # Unescape backslash sequences in values but preserve URL encoding
            values = unescape_backslash(self[value_split + 1 :])
        except ValueError as exc:
            raise ValueError(
                f"Content line could not be parsed into parts: '{self}': {exc}"
            ) from exc
        return (name, params, values)

    @classmethod
    def from_ical(cls, ical, strict=False):
        """Unfold the content lines in an iCalendar into long content lines.

        Parameters:
            ical: The folded text or bytes; converted with ``to_unicode``.
            strict: Stored on the resulting content line.
        """
        ical = to_unicode(ical)
        # a fold is a line break followed by either a space or a tab
        return cls(UFOLD.sub("", ical), strict=strict)

    def to_ical(self):
        """Return the line folded with ``foldline`` and encoded as bytes.

        Folding is what keeps long content lines within the line-length
        limit; the exact limit is decided by ``foldline``.
        """
        return foldline(self).encode(DEFAULT_ENCODING)
240
class Contentlines(list[Contentline]):
    """A list of content lines that can be parsed from and serialized to
    iCalendar text.

    The whole input is held in memory at once. That is fine for typical
    iCalendar files of a few kilobytes; huge files would be better served by
    an iterator.
    """

    def to_ical(self):
        """Serialize all non-empty lines, each terminated by CRLF."""
        encoded = [line.to_ical() for line in self if line]
        return b"\r\n".join(encoded) + b"\r\n"

    @classmethod
    def from_ical(cls, st):
        """Parse text or bytes into a list of unfolded content lines.

        Raises:
            ValueError: If unfolding, splitting or line construction fails.
        """
        text = to_unicode(st)
        try:
            # a fold is a line break followed by either a space or a tab;
            # removing the folds first lets the text be split on line breaks
            pieces = NEWLINE.split(UFOLD.sub("", text))
            # empty pieces (blank lines, trailing line break) are skipped
            lines = cls(map(Contentline, filter(None, pieces)))
            lines.append("")  # '\r\n' at the end of every content line
        except Exception as e:
            raise ValueError("Expected StringType with content lines") from e
        return lines
263
264
265__all__ = ["Contentline", "Contentlines"]