/src/postgres/src/common/file_utils.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * File-processing utility routines. |
4 | | * |
5 | | * Assorted utility functions to work on files. |
6 | | * |
7 | | * |
8 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
9 | | * Portions Copyright (c) 1994, Regents of the University of California |
10 | | * |
11 | | * src/common/file_utils.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | |
16 | | #ifndef FRONTEND |
17 | | #include "postgres.h" |
18 | | #else |
19 | | #include "postgres_fe.h" |
20 | | #endif |
21 | | |
22 | | #include <dirent.h> |
23 | | #include <fcntl.h> |
24 | | #include <sys/stat.h> |
25 | | #include <unistd.h> |
26 | | |
27 | | #include "common/file_utils.h" |
28 | | #ifdef FRONTEND |
29 | | #include "common/logging.h" |
30 | | #endif |
31 | | #include "common/relpath.h" |
32 | | #include "port/pg_iovec.h" |
33 | | |
34 | | #ifdef FRONTEND |
35 | | |
36 | | /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ |
37 | | #if defined(HAVE_SYNC_FILE_RANGE) |
38 | | #define PG_FLUSH_DATA_WORKS 1 |
39 | | #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) |
40 | | #define PG_FLUSH_DATA_WORKS 1 |
41 | | #endif |
42 | | |
43 | | /* |
44 | | * pg_xlog has been renamed to pg_wal in version 10. |
45 | | */ |
46 | | #define MINIMUM_VERSION_FOR_PG_WAL 100000 |
47 | | |
48 | | static void walkdir(const char *path, |
49 | | int (*action) (const char *fname, bool isdir), |
50 | | bool process_symlinks, |
51 | | const char *exclude_dir); |
52 | | |
53 | | #ifdef HAVE_SYNCFS |
54 | | |
/*
 * do_syncfs -- ask the kernel to flush the file system that holds "path"
 *
 * If the path cannot be opened, the problem is logged and we simply return.
 * If syncfs() itself fails, the problem is logged and the program exits.
 */
static void
do_syncfs(const char *path)
{
	int			fd = open(path, O_RDONLY, 0);

	if (fd < 0)
	{
		pg_log_error("could not open file \"%s\": %m", path);
		return;
	}

	if (syncfs(fd) >= 0)
	{
		/* Flushed successfully; nothing more to do. */
		(void) close(fd);
		return;
	}

	/* Log before close() so that %m still reflects the syncfs() failure. */
	pg_log_error("could not synchronize file system for file \"%s\": %m", path);
	(void) close(fd);
	exit(EXIT_FAILURE);
}
82 | | |
83 | | #endif /* HAVE_SYNCFS */ |
84 | | |
85 | | /* |
86 | | * Synchronize PGDATA and all its contents. |
87 | | * |
88 | | * We sync regular files and directories wherever they are, but we follow |
89 | | * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc. |
90 | | * Other symlinks are presumed to point at files we're not responsible for |
91 | | * syncing, and might not have privileges to write at all. |
92 | | * |
93 | | * serverVersion indicates the version of the server to be sync'd. |
94 | | * |
95 | | * If sync_data_files is false, this function skips syncing "base/" and any |
96 | | * other tablespace directories. |
97 | | */ |
98 | | void |
99 | | sync_pgdata(const char *pg_data, |
100 | | int serverVersion, |
101 | | DataDirSyncMethod sync_method, |
102 | | bool sync_data_files) |
103 | | { |
104 | | bool xlog_is_symlink; |
105 | | char pg_wal[MAXPGPATH]; |
106 | | char pg_tblspc[MAXPGPATH]; |
107 | | |
108 | | /* handle renaming of pg_xlog to pg_wal in post-10 clusters */ |
109 | | snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data, |
110 | | serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal"); |
111 | | snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR); |
112 | | |
113 | | /* |
114 | | * If pg_wal is a symlink, we'll need to recurse into it separately, |
115 | | * because the first walkdir below will ignore it. |
116 | | */ |
117 | | xlog_is_symlink = false; |
118 | | |
119 | | { |
120 | | struct stat st; |
121 | | |
122 | | if (lstat(pg_wal, &st) < 0) |
123 | | pg_log_error("could not stat file \"%s\": %m", pg_wal); |
124 | | else if (S_ISLNK(st.st_mode)) |
125 | | xlog_is_symlink = true; |
126 | | } |
127 | | |
128 | | switch (sync_method) |
129 | | { |
130 | | case DATA_DIR_SYNC_METHOD_SYNCFS: |
131 | | { |
132 | | #ifndef HAVE_SYNCFS |
133 | | pg_log_error("this build does not support sync method \"%s\"", |
134 | | "syncfs"); |
135 | | exit(EXIT_FAILURE); |
136 | | #else |
137 | | DIR *dir; |
138 | | struct dirent *de; |
139 | | |
140 | | /* |
141 | | * On Linux, we don't have to open every single file one by |
142 | | * one. We can use syncfs() to sync whole filesystems. We |
143 | | * only expect filesystem boundaries to exist where we |
144 | | * tolerate symlinks, namely pg_wal and the tablespaces, so we |
145 | | * call syncfs() for each of those directories. |
146 | | */ |
147 | | |
148 | | /* Sync the top level pgdata directory. */ |
149 | | do_syncfs(pg_data); |
150 | | |
151 | | /* If any tablespaces are configured, sync each of those. */ |
152 | | if (sync_data_files) |
153 | | { |
154 | | dir = opendir(pg_tblspc); |
155 | | if (dir == NULL) |
156 | | pg_log_error("could not open directory \"%s\": %m", |
157 | | pg_tblspc); |
158 | | else |
159 | | { |
160 | | while (errno = 0, (de = readdir(dir)) != NULL) |
161 | | { |
162 | | char subpath[MAXPGPATH * 2]; |
163 | | |
164 | | if (strcmp(de->d_name, ".") == 0 || |
165 | | strcmp(de->d_name, "..") == 0) |
166 | | continue; |
167 | | |
168 | | snprintf(subpath, sizeof(subpath), "%s/%s", |
169 | | pg_tblspc, de->d_name); |
170 | | do_syncfs(subpath); |
171 | | } |
172 | | |
173 | | if (errno) |
174 | | pg_log_error("could not read directory \"%s\": %m", |
175 | | pg_tblspc); |
176 | | |
177 | | (void) closedir(dir); |
178 | | } |
179 | | } |
180 | | |
181 | | /* If pg_wal is a symlink, process that too. */ |
182 | | if (xlog_is_symlink) |
183 | | do_syncfs(pg_wal); |
184 | | #endif /* HAVE_SYNCFS */ |
185 | | } |
186 | | break; |
187 | | |
188 | | case DATA_DIR_SYNC_METHOD_FSYNC: |
189 | | { |
190 | | char *exclude_dir = NULL; |
191 | | |
192 | | if (!sync_data_files) |
193 | | exclude_dir = psprintf("%s/base", pg_data); |
194 | | |
195 | | /* |
196 | | * If possible, hint to the kernel that we're soon going to |
197 | | * fsync the data directory and its contents. |
198 | | */ |
199 | | #ifdef PG_FLUSH_DATA_WORKS |
200 | | walkdir(pg_data, pre_sync_fname, false, exclude_dir); |
201 | | if (xlog_is_symlink) |
202 | | walkdir(pg_wal, pre_sync_fname, false, NULL); |
203 | | if (sync_data_files) |
204 | | walkdir(pg_tblspc, pre_sync_fname, true, NULL); |
205 | | #endif |
206 | | |
207 | | /* |
208 | | * Now we do the fsync()s in the same order. |
209 | | * |
210 | | * The main call ignores symlinks, so in addition to specially |
211 | | * processing pg_wal if it's a symlink, pg_tblspc has to be |
212 | | * visited separately with process_symlinks = true. Note that |
213 | | * if there are any plain directories in pg_tblspc, they'll |
214 | | * get fsync'd twice. That's not an expected case so we don't |
215 | | * worry about optimizing it. |
216 | | */ |
217 | | walkdir(pg_data, fsync_fname, false, exclude_dir); |
218 | | if (xlog_is_symlink) |
219 | | walkdir(pg_wal, fsync_fname, false, NULL); |
220 | | if (sync_data_files) |
221 | | walkdir(pg_tblspc, fsync_fname, true, NULL); |
222 | | |
223 | | if (exclude_dir) |
224 | | pfree(exclude_dir); |
225 | | } |
226 | | break; |
227 | | } |
228 | | } |
229 | | |
230 | | /* |
231 | | * Synchronize the given directory and all its contents. |
232 | | * |
233 | | * This is a convenient wrapper on top of walkdir() and do_syncfs(). |
234 | | */ |
235 | | void |
236 | | sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) |
237 | | { |
238 | | switch (sync_method) |
239 | | { |
240 | | case DATA_DIR_SYNC_METHOD_SYNCFS: |
241 | | { |
242 | | #ifndef HAVE_SYNCFS |
243 | | pg_log_error("this build does not support sync method \"%s\"", |
244 | | "syncfs"); |
245 | | exit(EXIT_FAILURE); |
246 | | #else |
247 | | /* |
248 | | * On Linux, we don't have to open every single file one by |
249 | | * one. We can use syncfs() to sync the whole filesystem. |
250 | | */ |
251 | | do_syncfs(dir); |
252 | | #endif /* HAVE_SYNCFS */ |
253 | | } |
254 | | break; |
255 | | |
256 | | case DATA_DIR_SYNC_METHOD_FSYNC: |
257 | | { |
258 | | /* |
259 | | * If possible, hint to the kernel that we're soon going to |
260 | | * fsync the data directory and its contents. |
261 | | */ |
262 | | #ifdef PG_FLUSH_DATA_WORKS |
263 | | walkdir(dir, pre_sync_fname, false, NULL); |
264 | | #endif |
265 | | |
266 | | walkdir(dir, fsync_fname, false, NULL); |
267 | | } |
268 | | break; |
269 | | } |
270 | | } |
271 | | |
272 | | /* |
273 | | * walkdir: recursively walk a directory, applying the action to each |
274 | | * regular file and directory (including the named directory itself). |
275 | | * |
276 | | * If process_symlinks is true, the action and recursion are also applied |
277 | | * to regular files and directories that are pointed to by symlinks in the |
278 | | * given directory; otherwise symlinks are ignored. Symlinks are always |
279 | | * ignored in subdirectories, ie we intentionally don't pass down the |
280 | | * process_symlinks flag to recursive calls. |
281 | | * |
282 | | * If exclude_dir is not NULL, it specifies a directory path to skip |
283 | | * processing. |
284 | | * |
285 | | * Errors are reported but not considered fatal. |
286 | | * |
287 | | * See also walkdir in fd.c, which is a backend version of this logic. |
288 | | */ |
289 | | static void |
290 | | walkdir(const char *path, |
291 | | int (*action) (const char *fname, bool isdir), |
292 | | bool process_symlinks, |
293 | | const char *exclude_dir) |
294 | | { |
295 | | DIR *dir; |
296 | | struct dirent *de; |
297 | | |
298 | | if (exclude_dir && strcmp(exclude_dir, path) == 0) |
299 | | return; |
300 | | |
301 | | dir = opendir(path); |
302 | | if (dir == NULL) |
303 | | { |
304 | | pg_log_error("could not open directory \"%s\": %m", path); |
305 | | return; |
306 | | } |
307 | | |
308 | | while (errno = 0, (de = readdir(dir)) != NULL) |
309 | | { |
310 | | char subpath[MAXPGPATH * 2]; |
311 | | |
312 | | if (strcmp(de->d_name, ".") == 0 || |
313 | | strcmp(de->d_name, "..") == 0) |
314 | | continue; |
315 | | |
316 | | snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name); |
317 | | |
318 | | switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR)) |
319 | | { |
320 | | case PGFILETYPE_REG: |
321 | | (*action) (subpath, false); |
322 | | break; |
323 | | case PGFILETYPE_DIR: |
324 | | walkdir(subpath, action, false, exclude_dir); |
325 | | break; |
326 | | default: |
327 | | |
328 | | /* |
329 | | * Errors are already reported directly by get_dirent_type(), |
330 | | * and any remaining symlinks and unknown file types are |
331 | | * ignored. |
332 | | */ |
333 | | break; |
334 | | } |
335 | | } |
336 | | |
337 | | if (errno) |
338 | | pg_log_error("could not read directory \"%s\": %m", path); |
339 | | |
340 | | (void) closedir(dir); |
341 | | |
342 | | /* |
343 | | * It's important to fsync the destination directory itself as individual |
344 | | * file fsyncs don't guarantee that the directory entry for the file is |
345 | | * synced. Recent versions of ext4 have made the window much wider but |
346 | | * it's been an issue for ext3 and other filesystems in the past. |
347 | | */ |
348 | | (*action) (path, true); |
349 | | } |
350 | | |
/*
 * Tell the OS, where the platform supports it, that this file is about to
 * be fsync()ed so that it can start writing the data out.
 *
 * An open failure with EACCES (or EISDIR for a directory) is ignored and
 * reported as success.  Any other open failure is logged and -1 is
 * returned.  Without PG_FLUSH_DATA_WORKS this does nothing and returns 0.
 */
int
pre_sync_fname(const char *fname, bool isdir)
{
#ifdef PG_FLUSH_DATA_WORKS
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		bool		ignorable = (errno == EACCES) || (isdir && errno == EISDIR);

		if (ignorable)
			return 0;

		pg_log_error("could not open file \"%s\": %m", fname);
		return -1;
	}

	/*
	 * Same preference order as pg_flush_data() in the backend:
	 * sync_file_range first, posix_fadvise as the fallback.  The result is
	 * ignored because this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
#endif							/* PG_FLUSH_DATA_WORKS */
	return 0;
}
391 | | |
392 | | /* |
393 | | * fsync_fname -- Try to fsync a file or directory |
394 | | * |
395 | | * Ignores errors trying to open unreadable files, or trying to fsync |
396 | | * directories on systems where that isn't allowed/required. All other errors |
397 | | * are fatal. |
398 | | */ |
399 | | int |
400 | | fsync_fname(const char *fname, bool isdir) |
401 | | { |
402 | | int fd; |
403 | | int flags; |
404 | | int returncode; |
405 | | |
406 | | /* |
407 | | * Some OSs require directories to be opened read-only whereas other |
408 | | * systems don't allow us to fsync files opened read-only; so we need both |
409 | | * cases here. Using O_RDWR will cause us to fail to fsync files that are |
410 | | * not writable by our userid, but we assume that's OK. |
411 | | */ |
412 | | flags = PG_BINARY; |
413 | | if (!isdir) |
414 | | flags |= O_RDWR; |
415 | | else |
416 | | flags |= O_RDONLY; |
417 | | |
418 | | /* |
419 | | * Open the file, silently ignoring errors about unreadable files (or |
420 | | * unsupported operations, e.g. opening a directory under Windows), and |
421 | | * logging others. |
422 | | */ |
423 | | fd = open(fname, flags, 0); |
424 | | if (fd < 0) |
425 | | { |
426 | | if (errno == EACCES || (isdir && errno == EISDIR)) |
427 | | return 0; |
428 | | pg_log_error("could not open file \"%s\": %m", fname); |
429 | | return -1; |
430 | | } |
431 | | |
432 | | returncode = fsync(fd); |
433 | | |
434 | | /* |
435 | | * Some OSes don't allow us to fsync directories at all, so we can ignore |
436 | | * those errors. Anything else needs to be reported. |
437 | | */ |
438 | | if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) |
439 | | { |
440 | | pg_log_error("could not fsync file \"%s\": %m", fname); |
441 | | (void) close(fd); |
442 | | exit(EXIT_FAILURE); |
443 | | } |
444 | | |
445 | | (void) close(fd); |
446 | | return 0; |
447 | | } |
448 | | |
449 | | /* |
450 | | * fsync_parent_path -- fsync the parent path of a file or directory |
451 | | * |
452 | | * This is aimed at making file operations persistent on disk in case of |
453 | | * an OS crash or power failure. |
454 | | */ |
455 | | int |
456 | | fsync_parent_path(const char *fname) |
457 | | { |
458 | | char parentpath[MAXPGPATH]; |
459 | | |
460 | | strlcpy(parentpath, fname, MAXPGPATH); |
461 | | get_parent_directory(parentpath); |
462 | | |
463 | | /* |
464 | | * get_parent_directory() returns an empty string if the input argument is |
465 | | * just a file name (see comments in path.c), so handle that as being the |
466 | | * current directory. |
467 | | */ |
468 | | if (strlen(parentpath) == 0) |
469 | | strlcpy(parentpath, ".", MAXPGPATH); |
470 | | |
471 | | if (fsync_fname(parentpath, true) != 0) |
472 | | return -1; |
473 | | |
474 | | return 0; |
475 | | } |
476 | | |
477 | | /* |
478 | | * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability |
479 | | * |
480 | | * Wrapper around rename, similar to the backend version. |
481 | | */ |
482 | | int |
483 | | durable_rename(const char *oldfile, const char *newfile) |
484 | | { |
485 | | int fd; |
486 | | |
487 | | /* |
488 | | * First fsync the old and target path (if it exists), to ensure that they |
489 | | * are properly persistent on disk. Syncing the target file is not |
490 | | * strictly necessary, but it makes it easier to reason about crashes; |
491 | | * because it's then guaranteed that either source or target file exists |
492 | | * after a crash. |
493 | | */ |
494 | | if (fsync_fname(oldfile, false) != 0) |
495 | | return -1; |
496 | | |
497 | | fd = open(newfile, PG_BINARY | O_RDWR, 0); |
498 | | if (fd < 0) |
499 | | { |
500 | | if (errno != ENOENT) |
501 | | { |
502 | | pg_log_error("could not open file \"%s\": %m", newfile); |
503 | | return -1; |
504 | | } |
505 | | } |
506 | | else |
507 | | { |
508 | | if (fsync(fd) != 0) |
509 | | { |
510 | | pg_log_error("could not fsync file \"%s\": %m", newfile); |
511 | | close(fd); |
512 | | exit(EXIT_FAILURE); |
513 | | } |
514 | | close(fd); |
515 | | } |
516 | | |
517 | | /* Time to do the real deal... */ |
518 | | if (rename(oldfile, newfile) != 0) |
519 | | { |
520 | | pg_log_error("could not rename file \"%s\" to \"%s\": %m", |
521 | | oldfile, newfile); |
522 | | return -1; |
523 | | } |
524 | | |
525 | | /* |
526 | | * To guarantee renaming the file is persistent, fsync the file with its |
527 | | * new name, and its containing directory. |
528 | | */ |
529 | | if (fsync_fname(newfile, false) != 0) |
530 | | return -1; |
531 | | |
532 | | if (fsync_parent_path(newfile) != 0) |
533 | | return -1; |
534 | | |
535 | | return 0; |
536 | | } |
537 | | |
538 | | #endif /* FRONTEND */ |
539 | | |
540 | | /* |
541 | | * Return the type of a directory entry. |
542 | | * |
543 | | * In frontend code, elevel should be a level from logging.h; in backend code |
544 | | * it should be a level from elog.h. |
545 | | */ |
546 | | PGFileType |
547 | | get_dirent_type(const char *path, |
548 | | const struct dirent *de, |
549 | | bool look_through_symlinks, |
550 | | int elevel) |
551 | 0 | { |
552 | 0 | PGFileType result; |
553 | | |
554 | | /* |
555 | | * Some systems tell us the type directly in the dirent struct, but that's |
556 | | * a BSD and Linux extension not required by POSIX. Even when the |
557 | | * interface is present, sometimes the type is unknown, depending on the |
558 | | * filesystem. |
559 | | */ |
560 | 0 | #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK) |
561 | 0 | if (de->d_type == DT_REG) |
562 | 0 | result = PGFILETYPE_REG; |
563 | 0 | else if (de->d_type == DT_DIR) |
564 | 0 | result = PGFILETYPE_DIR; |
565 | 0 | else if (de->d_type == DT_LNK && !look_through_symlinks) |
566 | 0 | result = PGFILETYPE_LNK; |
567 | 0 | else |
568 | 0 | result = PGFILETYPE_UNKNOWN; |
569 | | #else |
570 | | result = PGFILETYPE_UNKNOWN; |
571 | | #endif |
572 | |
|
573 | 0 | if (result == PGFILETYPE_UNKNOWN) |
574 | 0 | { |
575 | 0 | struct stat fst; |
576 | 0 | int sret; |
577 | | |
578 | |
|
579 | 0 | if (look_through_symlinks) |
580 | 0 | sret = stat(path, &fst); |
581 | 0 | else |
582 | 0 | sret = lstat(path, &fst); |
583 | |
|
584 | 0 | if (sret < 0) |
585 | 0 | { |
586 | 0 | result = PGFILETYPE_ERROR; |
587 | | #ifdef FRONTEND |
588 | | pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path); |
589 | | #else |
590 | 0 | ereport(elevel, |
591 | 0 | (errcode_for_file_access(), |
592 | 0 | errmsg("could not stat file \"%s\": %m", path))); |
593 | 0 | #endif |
594 | 0 | } |
595 | 0 | else if (S_ISREG(fst.st_mode)) |
596 | 0 | result = PGFILETYPE_REG; |
597 | 0 | else if (S_ISDIR(fst.st_mode)) |
598 | 0 | result = PGFILETYPE_DIR; |
599 | 0 | else if (S_ISLNK(fst.st_mode)) |
600 | 0 | result = PGFILETYPE_LNK; |
601 | 0 | } |
602 | | |
603 | 0 | return result; |
604 | 0 | } |
605 | | |
/*
 * Work out what is still outstanding after a vectored read or write that
 * may have been only partially completed.
 *
 * The first 'transferred' bytes described by 'source' are dropped; what
 * is left is written to 'destination' and the number of iovecs left is
 * returned.  'source' and 'destination' may be the same array, in which
 * case the adjustment happens in place.  A result of zero means everything
 * was transferred (for callers without a cheaper way to know that).
 */
int
compute_remaining_iovec(struct iovec *destination,
						const struct iovec *source,
						int iovcnt,
						size_t transferred)
{
	int			skipped = 0;
	int			remaining;

	Assert(iovcnt > 0);

	/* Step over every iovec that was transferred in full. */
	while (source[skipped].iov_len <= transferred)
	{
		transferred -= source[skipped].iov_len;
		skipped++;

		if (skipped == iovcnt)
		{
			/*
			 * Everything is accounted for.  Leftover bytes here would mean
			 * more was transferred than was asked for, i.e. something is
			 * out of sync.
			 */
			Assert(transferred == 0);
			return 0;
		}
	}

	remaining = iovcnt - skipped;

	/* Shift the unfinished iovecs to the start of the destination. */
	if (source + skipped != destination)
		memmove(destination, source + skipped, sizeof(*source) * remaining);

	/* The first one may be partially done; trim what was transferred. */
	Assert(destination->iov_len > transferred);
	destination->iov_base = (char *) destination->iov_base + transferred;
	destination->iov_len -= transferred;

	return remaining;
}
651 | | |
652 | | /* |
653 | | * pg_pwritev_with_retry |
654 | | * |
655 | | * Convenience wrapper for pg_pwritev() that retries on partial write. If an |
656 | | * error is returned, it is unspecified how much has been written. |
657 | | */ |
658 | | ssize_t |
659 | | pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) |
660 | 0 | { |
661 | 0 | struct iovec iov_copy[PG_IOV_MAX]; |
662 | 0 | ssize_t sum = 0; |
663 | 0 | ssize_t part; |
664 | | |
665 | | /* We'd better have space to make a copy, in case we need to retry. */ |
666 | 0 | if (iovcnt > PG_IOV_MAX) |
667 | 0 | { |
668 | 0 | errno = EINVAL; |
669 | 0 | return -1; |
670 | 0 | } |
671 | | |
672 | 0 | do |
673 | 0 | { |
674 | | /* Write as much as we can. */ |
675 | 0 | part = pg_pwritev(fd, iov, iovcnt, offset); |
676 | 0 | if (part < 0) |
677 | 0 | return -1; |
678 | | |
679 | | #ifdef SIMULATE_SHORT_WRITE |
680 | | part = Min(part, 4096); |
681 | | #endif |
682 | | |
683 | | /* Count our progress. */ |
684 | 0 | sum += part; |
685 | 0 | offset += part; |
686 | | |
687 | | /* |
688 | | * See what is left. On the first loop we used the caller's array, |
689 | | * but in later loops we'll use our local copy that we are allowed to |
690 | | * mutate. |
691 | | */ |
692 | 0 | iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part); |
693 | 0 | iov = iov_copy; |
694 | 0 | } while (iovcnt > 0); |
695 | | |
696 | 0 | return sum; |
697 | 0 | } |
698 | | |
699 | | /* |
700 | | * pg_pwrite_zeros |
701 | | * |
702 | | * Writes zeros to file worth "size" bytes at "offset" (from the start of the |
703 | | * file), using vectored I/O. |
704 | | * |
705 | | * Returns the total amount of data written. On failure, a negative value |
706 | | * is returned with errno set. |
707 | | */ |
708 | | ssize_t |
709 | | pg_pwrite_zeros(int fd, size_t size, off_t offset) |
710 | 0 | { |
711 | 0 | static const PGIOAlignedBlock zbuffer = {0}; /* worth BLCKSZ */ |
712 | 0 | void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; |
713 | 0 | struct iovec iov[PG_IOV_MAX]; |
714 | 0 | size_t remaining_size = size; |
715 | 0 | ssize_t total_written = 0; |
716 | | |
717 | | /* Loop, writing as many blocks as we can for each system call. */ |
718 | 0 | while (remaining_size > 0) |
719 | 0 | { |
720 | 0 | int iovcnt = 0; |
721 | 0 | ssize_t written; |
722 | |
|
723 | 0 | for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++) |
724 | 0 | { |
725 | 0 | size_t this_iov_size; |
726 | |
|
727 | 0 | iov[iovcnt].iov_base = zerobuf_addr; |
728 | |
|
729 | 0 | if (remaining_size < BLCKSZ) |
730 | 0 | this_iov_size = remaining_size; |
731 | 0 | else |
732 | 0 | this_iov_size = BLCKSZ; |
733 | |
|
734 | 0 | iov[iovcnt].iov_len = this_iov_size; |
735 | 0 | remaining_size -= this_iov_size; |
736 | 0 | } |
737 | |
|
738 | 0 | written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); |
739 | |
|
740 | 0 | if (written < 0) |
741 | 0 | return written; |
742 | | |
743 | 0 | offset += written; |
744 | 0 | total_written += written; |
745 | 0 | } |
746 | | |
747 | 0 | Assert(total_written == size); |
748 | |
|
749 | 0 | return total_written; |
750 | 0 | } |