Coverage Report

Created: 2024-06-18 06:05

/src/leptonica/src/utils2.c
Line
Count
Source (jump to first uncovered line)
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file utils2.c
29
 * <pre>
30
 *
31
 *      ------------------------------------------
32
 *      This file has these utilities:
33
 *         - safe string operations
34
 *         - find/replace operations on strings
35
 *         - read/write between file and memory
36
 *         - multi-platform file and directory operations
37
 *         - file name operations
38
 *      ------------------------------------------
39
 *
40
 *       Safe string procs
41
 *           char      *stringNew()
42
 *           l_int32    stringCopy()
43
 *           l_int32    stringCopySegment()
44
 *           l_int32    stringReplace()
45
 *           l_int32    stringLength()
46
 *           l_int32    stringCat()
47
 *           char      *stringConcatNew()
48
 *           char      *stringJoin()
49
 *           l_int32    stringJoinIP()
50
 *           char      *stringReverse()
51
 *           char      *strtokSafe()
52
 *           l_int32    stringSplitOnToken()
53
 *
54
 *       Find and replace string and array procs
55
 *           l_int32    stringCheckForChars()
56
 *           char      *stringRemoveChars()
57
 *           char      *stringReplaceEachSubstr()
58
 *           char      *stringReplaceSubstr()
59
 *           L_DNA     *stringFindEachSubstr()
60
 *           l_int32    stringFindSubstr()
61
 *           l_uint8   *arrayReplaceEachSequence()
62
 *           L_DNA     *arrayFindEachSequence()
63
 *           l_int32    arrayFindSequence()
64
 *
65
 *       Safe realloc
66
 *           void      *reallocNew()
67
 *
68
 *       Read and write between file and memory
69
 *           l_uint8   *l_binaryRead()
70
 *           l_uint8   *l_binaryReadStream()
71
 *           l_uint8   *l_binaryReadSelect()
72
 *           l_uint8   *l_binaryReadSelectStream()
73
 *           l_int32    l_binaryWrite()
74
 *           l_int32    nbytesInFile()
75
 *           l_int32    fnbytesInFile()
76
 *
77
 *       Copy and compare in memory
78
 *           l_uint8   *l_binaryCopy()
79
 *           l_uint8   *l_binaryCompare()
80
 *
81
 *       File copy operations
82
 *           l_int32    fileCopy()
83
 *           l_int32    fileConcatenate()
84
 *           l_int32    fileAppendString()
85
 *
86
 *       File split operations
87
 *           l_int32    fileSplitLinesUniform()
88
 *
89
 *       Multi-platform functions for opening file streams
90
 *           FILE      *fopenReadStream()
91
 *           FILE      *fopenWriteStream()
92
 *           FILE      *fopenReadFromMemory()
93
 *
94
 *       Opening a Windows tmpfile for writing
95
 *           FILE      *fopenWriteWinTempfile()
96
 *
97
 *       Multi-platform functions that avoid C-runtime boundary crossing
98
 *       with Windows DLLs  (use in programs only)
99
 *           FILE      *lept_fopen()
100
 *           l_int32    lept_fclose()
101
 *           void      *lept_calloc()
102
 *           void       lept_free()
103
 *
104
 *       Multi-platform file system operations in temp directories
105
 *           l_int32    lept_mkdir()
106
 *           l_int32    lept_rmdir()
107
 *           l_int32    lept_direxists()
108
 *           l_int32    lept_mv()
109
 *           l_int32    lept_rm_match()
110
 *           l_int32    lept_rm()
111
 *           l_int32    lept_rmfile()
112
 *           l_int32    lept_cp()
113
 *
114
 *       Special debug/test function for calling 'system'
115
 *           l_int32    callSystemDebug()
116
 *
117
 *       General file name operations
118
 *           l_int32    splitPathAtDirectory()
119
 *           l_int32    splitPathAtExtension()
120
 *           char      *pathJoin()
121
 *           char      *appendSubdirs()
122
 *
123
 *       Special file name operations
124
 *           l_int32    convertSepCharsInPath()
125
 *           char      *genPathname()
126
 *           l_int32    makeTempDirname()
127
 *           l_int32    modifyTrailingSlash()
128
 *           char      *l_makeTempFilename()
129
 *           l_int32    extractNumberFromFilename()
130
 *
131
 *
132
 *  Notes on multi-platform development
133
 *  -----------------------------------
134
 *  This is important:
135
 *  (1) With the exception of splitPathAtDirectory(), splitPathAtExtension()
136
 *      and genPathname(), all input pathnames must have unix separators.
137
 *  (2) On macOS, iOS and Windows, for read or write to "/tmp/..."
138
 *      the filename is rewritten to use the OS specific temp directory:
139
 *         /tmp  ==>   [Temp]/...
140
 *  (3) This filename rewrite, along with the conversion from unix
141
 *      to OS specific pathnames, happens in genPathname().
142
 *  (4) Use fopenReadStream() and fopenWriteStream() to open files,
143
 *      because these use genPathname() to find the platform-dependent
144
 *      filenames.  Likewise for l_binaryRead() and l_binaryWrite().
145
 *  (5) For moving, copying and removing files and directories that are in
146
 *      subdirectories of /tmp, use the lept_*() file system shell wrappers:
147
 *         lept_mkdir(), lept_rmdir(), lept_mv(), lept_rm() and lept_cp().
148
 *  (6) For programs use the lept_fopen(), lept_fclose(), lept_calloc()
149
 *      and lept_free() C library wrappers.  These work properly on Windows,
150
 *      where the same DLL must perform complementary operations on
151
 *      file streams (open/close) and heap memory (malloc/free).
152
 *  (7) Why read and write files to temp directories?
153
 *      The library needs the ability to read and write ephemeral
154
 *      files to default places, both for generating debugging output
155
 *      and for supporting regression tests.  Applications also need
156
 *      this ability for debugging.
157
 *  (8) Why do the pathname rewrite on macOS, iOS and Windows?
158
 *      The goal is to have the library, and programs using the library,
159
 *      run on multiple platforms without changes.  The location of
160
 *      temporary files depends on the platform as well as the user's
161
 *      configuration.  Temp files on some operating systems are in some
162
 *      directory not known a priori.  To make everything work seamlessly on
163
 *      any OS, every time you open a file for reading or writing,
164
 *      use a special function such as fopenReadStream() or
165
 *      fopenWriteStream(); these call genPathname() to ensure that
166
 *      if it is a temp file, the correct path is used.  To indicate
167
 *      that this is a temp file, the application is written with the
168
 *      root directory of the path in a canonical form: "/tmp".
169
 *  (9) Why is it that multi-platform directory functions like lept_mkdir()
170
 *      and lept_rmdir(), as well as associated file functions like
171
 *      lept_rm(), lept_mv() and lept_cp(), only work in the temp dir?
172
 *      These functions were designed to provide easy manipulation of
173
 *      temp files.  The restriction to temp files is for safety -- to
174
 *      prevent an accidental deletion of important files.  For example,
175
 *      lept_rmdir() first deletes all files in a specified subdirectory
176
 *      of temp, and then removes the directory.
177
 *
178
 * </pre>
179
 */
180
181
#ifdef HAVE_CONFIG_H
182
#include <config_auto.h>
183
#endif  /* HAVE_CONFIG_H */
184
185
#ifdef _MSC_VER
186
#include <process.h>
187
#include <direct.h>
188
#define getcwd _getcwd  /* fix MSVC warning */
189
#else
190
#include <unistd.h>
191
#endif   /* _MSC_VER */
192
193
#ifdef _WIN32
194
#include <windows.h>
195
#include <fcntl.h>     /* _O_CREAT, ... */
196
#include <io.h>        /* _open */
197
#include <sys/stat.h>  /* _S_IREAD, _S_IWRITE */
198
#else
199
#include <sys/stat.h>  /* for stat, mkdir(2) */
200
#include <sys/types.h>
201
#endif
202
203
#ifdef __APPLE__
204
#include <unistd.h>
205
#include <errno.h>
206
#endif
207
208
#include <string.h>
209
#include <stddef.h>
210
#include "allheaders.h"
211
212
#if defined(__APPLE__) || defined(_WIN32)
213
/* Rewrite paths starting with /tmp for macOS, iOS and Windows. */
214
#define REWRITE_TMP
215
#endif
216
217
/*--------------------------------------------------------------------*
218
 *                       Safe string operations                       *
219
 *--------------------------------------------------------------------*/
220
/*!
221
 * \brief   stringNew()
222
 *
223
 * \param[in]    src
224
 * \return  dest copy of %src string, or NULL on error
225
 */
226
char *
227
stringNew(const char  *src)
228
16.8k
{
229
16.8k
l_int32  len;
230
16.8k
char    *dest;
231
232
16.8k
    if (!src) {
233
0
        L_WARNING("src not defined\n", __func__);
234
0
        return NULL;
235
0
    }
236
237
16.8k
    len = strlen(src);
238
16.8k
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
239
0
        return (char *)ERROR_PTR("dest not made", __func__, NULL);
240
241
16.8k
    stringCopy(dest, src, len);
242
16.8k
    return dest;
243
16.8k
}
244
245
246
/*!
247
 * \brief   stringCopy()
248
 *
249
 * \param[in]    dest    existing byte buffer
250
 * \param[in]    src     string [optional] can be null
251
 * \param[in]    n       max number of characters to copy
252
 * \return  0 if OK, 1 on error
253
 *
254
 * <pre>
255
 * Notes:
256
 *      (1) Relatively safe wrapper for strncpy, that checks the input,
257
 *          and does not complain if %src is null or %n < 1.
258
 *          If %n < 1, this is a no-op.
259
 *      (2) %dest needs to be at least %n bytes in size.
260
 *      (3) We don't call strncpy() because valgrind complains about
261
 *          use of uninitialized values.
262
 * </pre>
263
 */
264
l_ok
265
stringCopy(char        *dest,
266
           const char  *src,
267
           l_int32      n)
268
23.3k
{
269
23.3k
l_int32  i;
270
271
23.3k
    if (!dest)
272
0
        return ERROR_INT("dest not defined", __func__, 1);
273
23.3k
    if (!src || n < 1)
274
0
        return 0;
275
276
        /* Implementation of strncpy that valgrind doesn't complain about */
277
277k
    for (i = 0; i < n && src[i] != '\0'; i++)
278
254k
        dest[i] = src[i];
279
23.3k
    for (; i < n; i++)
280
0
        dest[i] = '\0';
281
23.3k
    return 0;
282
23.3k
}
283
284
285
/*!
286
 * \brief   stringCopySegment()
287
 *
288
 *
289
 * \param[in]    src      string
290
 * \param[in]    start    byte position at start of segment
291
 * \param[in]    nbytes   number of bytes in the segment; use 0 to go to end
292
 * \return  copy of segment, or NULL on error
293
 *
294
 * <pre>
295
 * Notes:
296
 *      (1) This is a variant of stringNew() that makes a new string
297
 *          from a segment of the input string.  The segment is specified
298
 *          by the starting position and the number of bytes.
299
 *      (2) The start location %start must be within the string %src.
300
 *      (3) The copy is truncated to the end of the source string.
301
 *          Use %nbytes = 0 to copy to the end of %src.
302
 * </pre>
303
 */
304
char *
305
stringCopySegment(const char  *src,
306
                  l_int32      start,
307
                  l_int32      nbytes)
308
0
{
309
0
char    *dest;
310
0
l_int32  len;
311
312
0
    if (!src)
313
0
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
314
0
    len = strlen(src);
315
0
    if (start < 0 || start > len - 1)
316
0
        return (char *)ERROR_PTR("invalid start", __func__, NULL);
317
0
    if (nbytes <= 0)  /* copy to the end */
318
0
        nbytes = len - start;
319
0
    if (start + nbytes > len)  /* truncate to the end */
320
0
        nbytes = len - start;
321
0
    if ((dest = (char *)LEPT_CALLOC(nbytes + 1, sizeof(char))) == NULL)
322
0
        return (char *)ERROR_PTR("dest not made", __func__, NULL);
323
0
    stringCopy(dest, src + start, nbytes);
324
0
    return dest;
325
0
}
326
327
328
/*!
329
 * \brief   stringReplace()
330
 *
331
 * \param[out]   pdest    string copy
332
 * \param[in]    src      [optional] string; can be null
333
 * \return  0 if OK; 1 on error
334
 *
335
 * <pre>
336
 * Notes:
337
 *      (1) Frees any existing dest string
338
 *      (2) Puts a copy of src string in the dest
339
 *      (3) If either or both strings are null, does something reasonable.
340
 * </pre>
341
 */
342
l_ok
343
stringReplace(char       **pdest,
344
              const char  *src)
345
4.69M
{
346
4.69M
    if (!pdest)
347
0
        return ERROR_INT("pdest not defined", __func__, 1);
348
349
4.69M
    if (*pdest)
350
0
        LEPT_FREE(*pdest);
351
352
4.69M
    if (src)
353
0
        *pdest = stringNew(src);
354
4.69M
    else
355
4.69M
        *pdest = NULL;
356
4.69M
    return 0;
357
4.69M
}
358
359
360
/*!
361
 * \brief   stringLength()
362
 *
363
 * \param[in]    src    string can be null or NULL-terminated string
364
 * \param[in]    size   number of bytes to check; e.g., size of src buffer
365
 * \return  length of src in bytes; 0 if no bytes are found;
366
 *                                  %size on error when NUL byte is not found.
367
 *
368
 * <pre>
369
 * Notes:
370
 *      (1) Safe implementation of strlen that only checks %size bytes
371
 *          for trailing NUL.
372
 *      (2) Valid returned string lengths are between 0 and size - 1.
373
 *          If %size bytes are checked without finding a NUL byte, then
374
 *          an error is indicated by returning %size.
375
 * </pre>
376
 */
377
l_int32
378
stringLength(const char  *src,
379
             size_t       size)
380
0
{
381
0
l_int32  i;
382
383
0
    if (!src)
384
0
        return 0;
385
0
    if (size < 1)
386
0
        return ERROR_INT("size < 1; too small", __func__, 0);
387
388
0
    for (i = 0; i < size; i++) {
389
0
        if (src[i] == '\0')
390
0
            return i;
391
0
    }
392
393
        /* Didn't find a NUL byte */
394
0
    L_ERROR("NUL byte not found in %zu bytes\n", __func__, size);
395
0
    return size;
396
0
}
397
398
399
/*!
400
 * \brief   stringCat()
401
 *
402
 * \param[in]    dest    null-terminated byte buffer
403
 * \param[in]    size    size of dest buffer
404
 * \param[in]    src     string can be null or NULL-terminated string
405
 * \return  number of bytes added to dest; -1 on error
406
 *
407
 * <pre>
408
 * Notes:
409
 *      (1) Alternative implementation of strncat, that checks the input,
410
 *          is easier to use (since the size of the dest buffer is specified
411
 *          rather than the number of bytes to copy), and does not complain
412
 *          if %src is null.
413
 *      (2) Never writes past end of dest.
414
 *      (3) If there is not enough room to append the src, which is an error,
415
 *          it does nothing.
416
 *      (4) N.B. The order of 2nd and 3rd args is reversed from that in
417
 *          strncat, as in the Windows function strcat_s().
418
 * </pre>
419
 */
420
l_int32
421
stringCat(char        *dest,
422
          size_t       size,
423
          const char  *src)
424
0
{
425
0
l_int32  i, n;
426
0
l_int32  lendest, lensrc;
427
428
0
    if (!dest)
429
0
        return ERROR_INT("dest not defined", __func__, -1);
430
0
    if (size < 1)
431
0
        return ERROR_INT("size < 1; too small", __func__, -1);
432
0
    if (!src)
433
0
        return 0;
434
435
0
    lendest = stringLength(dest, size);
436
0
    if (lendest == size)
437
0
        return ERROR_INT("no terminating nul byte", __func__, -1);
438
0
    lensrc = stringLength(src, size);
439
0
    if (lensrc == 0)
440
0
        return 0;  /* nothing added to dest */
441
0
    n = (lendest + lensrc > size - 1) ? 0 : lensrc;
442
0
    if (n == 0)
443
0
        return ERROR_INT("dest too small for append", __func__, -1);
444
445
0
    for (i = 0; i < n; i++)
446
0
        dest[lendest + i] = src[i];
447
0
    dest[lendest + n] = '\0';
448
0
    return n;
449
0
}
450
451
452
/*!
 * \brief   stringConcatNew()
 *
 * \param[in]    first    first string in list
 * \param[in]    ...      NULL-terminated list of strings
 * \return  result new string concatenating the input strings, or
 *                      NULL if first == NULL or on allocation failure
 *
 * <pre>
 * Notes:
 *      (1) The last arg in the list of strings must be NULL.
 *      (2) Caller must free the returned string.
 *      (3) The argument list is traversed twice: once to find the
 *          total length and once to copy the strings.
 * </pre>
 */
char *
stringConcatNew(const char  *first, ...)
{
size_t       len;
char        *result, *ptr;
const char  *arg;
va_list      args;

    if (!first) return NULL;

        /* Find the length of the output string */
    va_start(args, first);
    len = strlen(first);
    while ((arg = va_arg(args, const char *)) != NULL)
        len += strlen(arg);
    va_end(args);

        /* Do not write through a failed allocation */
    if ((result = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("result not made", __func__, NULL);

        /* Concatenate the args; the terminating NUL is the last
         * byte of the calloc'd buffer. */
    va_start(args, first);
    ptr = result;
    arg = first;
    while (*arg)
        *ptr++ = *arg++;
    while ((arg = va_arg(args, const char *)) != NULL) {
        while (*arg)
            *ptr++ = *arg++;
    }
    va_end(args);
    return result;
}
497
498
499
/*!
500
 * \brief   stringJoin()
501
 *
502
 * \param[in]    src1    [optional] string; can be null
503
 * \param[in]    src2    [optional] string; can be null
504
 * \return  concatenated string, or NULL on error
505
 *
506
 * <pre>
507
 * Notes:
508
 *      (1) This is a safe version of strcat; it makes a new string.
509
 *      (2) It is not an error if either or both of the strings
510
 *          are empty, or if either or both of the pointers are null.
511
 * </pre>
512
 */
513
char *
514
stringJoin(const char  *src1,
515
           const char  *src2)
516
0
{
517
0
char    *dest;
518
0
l_int32  srclen1, srclen2, destlen;
519
520
0
    srclen1 = (src1) ? strlen(src1) : 0;
521
0
    srclen2 = (src2) ? strlen(src2) : 0;
522
0
    destlen = srclen1 + srclen2 + 3;
523
524
0
    if ((dest = (char *)LEPT_CALLOC(destlen, sizeof(char))) == NULL)
525
0
        return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
526
527
0
    if (src1)
528
0
        stringCat(dest, destlen, src1);
529
0
    if (src2)
530
0
        stringCat(dest, destlen, src2);
531
0
    return dest;
532
0
}
533
534
535
/*!
536
 * \brief   stringJoinIP()
537
 *
538
 * \param[in,out]  psrc1   address of string src1; cannot be on the stack
539
 * \param[in]      src2    [optional] string; can be null
540
 * \return  0 if OK, 1 on error
541
 *
542
 * <pre>
543
 * Notes:
544
 *      (1) This is a safe in-place version of strcat.  The contents of
545
 *          src1 is replaced by the concatenation of src1 and src2.
546
 *      (2) It is not an error if either or both of the strings
547
 *          are empty (""), or if the pointers to the strings (*psrc1, src2)
548
 *          are null.
549
 *      (3) src1 should be initialized to null or an empty string
550
 *          before the first call.  Use one of these:
551
 *              char *src1 = NULL;
552
 *              char *src1 = stringNew("");
553
 *          Then call with:
554
 *              stringJoinIP(&src1, src2);
555
 *      (4) This can also be implemented as a macro:
556
 * \code
557
 *              #define stringJoinIP(src1, src2) \
558
 *                  {tmpstr = stringJoin((src1),(src2)); \
559
 *                  LEPT_FREE(src1); \
560
 *                  (src1) = tmpstr;}
561
 * \endcode
562
 *      (5) Another function to consider for joining many strings is
563
 *          stringConcatNew().
564
 * </pre>
565
 */
566
l_ok
567
stringJoinIP(char       **psrc1,
568
             const char  *src2)
569
0
{
570
0
char  *tmpstr;
571
572
0
    if (!psrc1)
573
0
        return ERROR_INT("&src1 not defined", __func__, 1);
574
575
0
    tmpstr = stringJoin(*psrc1, src2);
576
0
    LEPT_FREE(*psrc1);
577
0
    *psrc1 = tmpstr;
578
0
    return 0;
579
0
}
580
581
582
/*!
583
 * \brief   stringReverse()
584
 *
585
 * \param[in]    src    string
586
 * \return  dest newly-allocated reversed string
587
 */
588
char *
589
stringReverse(const char  *src)
590
0
{
591
0
char    *dest;
592
0
l_int32  i, len;
593
594
0
    if (!src)
595
0
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
596
0
    len = strlen(src);
597
0
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
598
0
        return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
599
0
    for (i = 0; i < len; i++)
600
0
        dest[i] = src[len - 1 - i];
601
602
0
    return dest;
603
0
}
604
605
606
/*!
607
 * \brief   strtokSafe()
608
 *
609
 * \param[in]    cstr      input string to be sequentially parsed;
610
 *                         use NULL after the first call
611
 * \param[in]    seps      a string of character separators
612
 * \param[out]   psaveptr  ptr to the next char after
613
 *                         the last encountered separator
614
 * \return  substr         a new string that is copied from the previous
615
 *                         saveptr up to but not including the next
616
 *                         separator character, or NULL if end of cstr.
617
 *
618
 * <pre>
619
 * Notes:
620
 *      (1) This is a thread-safe implementation of strtok.
621
 *      (2) It has the same interface as strtok_r.
622
 *      (3) It differs from strtok_r in usage in two respects:
623
 *          (a) the input string is not altered
624
 *          (b) each returned substring is newly allocated and must
625
 *              be freed after use.
626
 *      (4) Let me repeat that.  This is "safe" because the input
627
 *          string is not altered and because each returned string
628
 *          is newly allocated on the heap.
629
 *      (5) It is here because, surprisingly, some C libraries don't
630
 *          include strtok_r.
631
 *      (6) Important usage points:
632
 *          ~ Input the string to be parsed on the first invocation.
633
 *          ~ Then input NULL after that; the value returned in saveptr
634
 *            is used in all subsequent calls.
635
 *      (7) This is only slightly slower than strtok_r.
636
 * </pre>
637
 */
638
char *
639
strtokSafe(char        *cstr,
640
           const char  *seps,
641
           char       **psaveptr)
642
2.59k
{
643
2.59k
char     nextc;
644
2.59k
char    *start, *substr;
645
2.59k
l_int32  istart, i, j, nchars;
646
647
2.59k
    if (!seps)
648
0
        return (char *)ERROR_PTR("seps not defined", __func__, NULL);
649
2.59k
    if (!psaveptr)
650
0
        return (char *)ERROR_PTR("&saveptr not defined", __func__, NULL);
651
652
2.59k
    if (!cstr) {
653
1.29k
        start = *psaveptr;
654
1.29k
    } else {
655
1.29k
        start = cstr;
656
1.29k
        *psaveptr = NULL;
657
1.29k
    }
658
2.59k
    if (!start)  /* nothing to do */
659
1.29k
        return NULL;
660
661
        /* First time, scan for the first non-sep character */
662
1.29k
    istart = 0;
663
1.29k
    if (cstr) {
664
1.29k
        for (istart = 0;; istart++) {
665
1.29k
            if ((nextc = start[istart]) == '\0') {
666
0
                *psaveptr = NULL;  /* in case caller doesn't check ret value */
667
0
                return NULL;
668
0
            }
669
1.29k
            if (!strchr(seps, nextc))
670
1.29k
                break;
671
1.29k
        }
672
1.29k
    }
673
674
        /* Scan through, looking for a sep character; if none is
675
         * found, 'i' will be at the end of the string. */
676
11.6k
    for (i = istart;; i++) {
677
11.6k
        if ((nextc = start[i]) == '\0')
678
1.29k
            break;
679
10.3k
        if (strchr(seps, nextc))
680
0
            break;
681
10.3k
    }
682
683
        /* Save the substring */
684
1.29k
    nchars = i - istart;
685
1.29k
    substr = (char *)LEPT_CALLOC(nchars + 1, sizeof(char));
686
1.29k
    stringCopy(substr, start + istart, nchars);
687
688
        /* Look for the next non-sep character.
689
         * If this is the last substring, return a null saveptr. */
690
1.29k
    for (j = i;; j++) {
691
1.29k
        if ((nextc = start[j]) == '\0') {
692
1.29k
            *psaveptr = NULL;  /* no more non-sep characters */
693
1.29k
            break;
694
1.29k
        }
695
0
        if (!strchr(seps, nextc)) {
696
0
            *psaveptr = start + j;  /* start here on next call */
697
0
                break;
698
0
        }
699
0
    }
700
701
1.29k
    return substr;
702
1.29k
}
703
704
705
/*!
706
 * \brief   stringSplitOnToken()
707
 *
708
 * \param[in]    cstr     input string to be split; not altered
709
 * \param[in]    seps     a string of character separators
710
 * \param[out]   phead    ptr to copy of the input string, up to
711
 *                        the first separator token encountered
712
 * \param[out]   ptail    ptr to copy of the part of the input string
713
 *                        starting with the first non-separator character
714
 *                        that occurs after the first separator is found
715
 * \return  0 if OK, 1 on error
716
 *
717
 * <pre>
718
 * Notes:
719
 *      (1) The input string is not altered; all split parts are new strings.
720
 *      (2) The split occurs around the first consecutive sequence of
721
 *          tokens encountered.
722
 *      (3) The head goes from the beginning of the string up to
723
 *          but not including the first token found.
724
 *      (4) The tail contains the second part of the string, starting
725
 *          with the first char in that part that is NOT a token.
726
 *      (5) If no separator token is found, 'head' contains a copy
727
 *          of the input string and 'tail' is null.
728
 * </pre>
729
 */
730
l_ok
731
stringSplitOnToken(char        *cstr,
732
                   const char  *seps,
733
                   char       **phead,
734
                   char       **ptail)
735
0
{
736
0
char  *saveptr;
737
738
0
    if (!phead)
739
0
        return ERROR_INT("&head not defined", __func__, 1);
740
0
    if (!ptail)
741
0
        return ERROR_INT("&tail not defined", __func__, 1);
742
0
    *phead = *ptail = NULL;
743
0
    if (!cstr)
744
0
        return ERROR_INT("cstr not defined", __func__, 1);
745
0
    if (!seps)
746
0
        return ERROR_INT("seps not defined", __func__, 1);
747
748
0
    *phead = strtokSafe(cstr, seps, &saveptr);
749
0
    if (saveptr)
750
0
        *ptail = stringNew(saveptr);
751
0
    return 0;
752
0
}
753
754
755
/*--------------------------------------------------------------------*
756
 *                       Find and replace procs                       *
757
 *--------------------------------------------------------------------*/
758
/*!
759
 * \brief   stringCheckForChars()
760
 *
761
 * \param[in]    src      input string; can be of zero length
762
 * \param[in]    chars    string of chars to be searched for in %src
763
 * \param[out]   pfound   1 if any characters are found; 0 otherwise
764
 * \return  0 if OK, 1 on error
765
 *
766
 * <pre>
767
 * Notes:
768
 *      (1) This can be used to sanitize an operation by checking for
769
 *          special characters that don't belong in a string.
770
 * </pre>
771
 */
772
l_ok
773
stringCheckForChars(const char  *src,
774
                    const char  *chars,
775
                    l_int32     *pfound)
776
0
{
777
0
char     ch;
778
0
l_int32  i, n;
779
780
0
    if (!pfound)
781
0
        return ERROR_INT("&found not defined", __func__, 1);
782
0
    *pfound = FALSE;
783
0
    if (!src || !chars)
784
0
        return ERROR_INT("src and chars not both defined", __func__, 1);
785
786
0
    n = strlen(src);
787
0
    for (i = 0; i < n; i++) {
788
0
        ch = src[i];
789
0
        if (strchr(chars, ch)) {
790
0
            *pfound = TRUE;
791
0
            break;
792
0
        }
793
0
    }
794
0
    return 0;
795
0
}
796
797
798
/*!
 * \brief   stringRemoveChars()
 *
 * \param[in]    src        input string; can be of zero length
 * \param[in]    remchars   string of chars to be removed from src
 * \return  dest string with specified chars removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) If %remchars is null, a plain copy of %src is returned.
 *      (2) The result is allocated at the full size of %src, so it
 *          can hold the output even when nothing is removed.
 * </pre>
 */
char *
stringRemoveChars(const char  *src,
                  const char  *remchars)
{
char        *dest, *out;
const char  *in;

    if (src == NULL)
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
    if (remchars == NULL)
        return stringNew(src);

    dest = (char *)LEPT_CALLOC(strlen(src) + 1, sizeof(char));
    if (dest == NULL)
        return (char *)ERROR_PTR("dest not made", __func__, NULL);

        /* Keep only the chars that are not in %remchars; the
         * calloc'd buffer supplies the terminating NUL. */
    out = dest;
    for (in = src; *in != '\0'; in++) {
        if (strchr(remchars, *in) == NULL)
            *out++ = *in;
    }

    return dest;
}
829
830
831
/*!
832
 * \brief   stringReplaceEachSubstr()
833
 *
834
 * \param[in]    src      input string; can be of zero length
835
 * \param[in]    sub1     substring to be replaced
836
 * \param[in]    sub2     substring to put in; can be ""
837
 * \param[out]   pcount   [optional] the number of times that sub1
838
 *                        is found in src; 0 if not found
839
 * \return  dest string with substring replaced, or NULL if the
840
 *              substring not found or on error.
841
 *
842
 * <pre>
843
 * Notes:
844
 *      (1) This is a wrapper for simple string substitution that uses
845
 *          the more general function arrayReplaceEachSequence().
846
 *      (2) This finds every non-overlapping occurrence of %sub1 in
847
 *          %src, and replaces it with %sub2.  By "non-overlapping"
848
 *          we mean that after it finds each match, it removes the
849
 *          matching characters, replaces with the substitution string
850
 *          (if not empty), and continues.  For example, if you replace
851
 *          'aa' by 'X' in 'baaabbb', you find one match at position 1
852
 *          and return 'bXabbb'.
853
 *      (3) To only remove each instance of sub1, use "" for sub2
854
 *      (4) Returns a copy of %src if sub1 and sub2 are the same.
855
 *      (5) If the input %src is binary data that can have null characters,
856
 *          use arrayReplaceEachSequence() directly.
857
 * </pre>
858
 */
859
char *
860
stringReplaceEachSubstr(const char  *src,
861
                        const char  *sub1,
862
                        const char  *sub2,
863
                        l_int32     *pcount)
864
0
{
865
0
size_t  datalen;
866
867
0
    if (pcount) *pcount = 0;
868
0
    if (!src || !sub1 || !sub2)
869
0
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
870
0
                                 __func__, NULL);
871
872
0
    if (strlen(sub2) > 0) {
873
0
        return (char *)arrayReplaceEachSequence(
874
0
                               (const l_uint8 *)src, strlen(src),
875
0
                               (const l_uint8 *)sub1, strlen(sub1),
876
0
                               (const l_uint8 *)sub2, strlen(sub2),
877
0
                               &datalen, pcount);
878
0
    } else {  /* empty replacement string; removal only */
879
0
        return (char *)arrayReplaceEachSequence(
880
0
                               (const l_uint8 *)src, strlen(src),
881
0
                               (const l_uint8 *)sub1, strlen(sub1),
882
0
                               NULL, 0, &datalen, pcount);
883
0
    }
884
0
}
885
886
887
/*!
888
 * \brief   stringReplaceSubstr()
889
 *
890
 * \param[in]      src      input string; can be of zero length
891
 * \param[in]      sub1     substring to be replaced
892
 * \param[in]      sub2     substring to put in; can be ""
893
 * \param[in,out]  ploc     [optional] input start location for search;
894
 *                          returns the loc after replacement
895
 * \param[out]     pfound   [optional] 1 if sub1 is found; 0 otherwise
896
 * \return  dest string with substring replaced, or NULL on error.
897
 *
898
 * <pre>
899
 * Notes:
900
 *      (1) Replaces the first instance.
901
 *      (2) To remove sub1 without replacement, use "" for sub2.
902
 *      (3) Returns a copy of %src if either no instance of %sub1 is found,
903
 *          or if %sub1 and %sub2 are the same.
904
 *      (4) If %ploc == NULL, the search will start at the beginning of %src.
905
 *          If %ploc != NULL, *ploc must be initialized to the byte offset
906
 *          within %src from which the search starts.  To search the
907
 *          string from the beginning, set %loc = 0 and input &loc.
908
 *          After finding %sub1 and replacing it with %sub2, %loc will be
909
 *          returned as the next position after %sub2 in the output string.
910
 *      (5) Note that the output string also includes all the characters
911
 *          from the input string that occur after the single substitution.
912
 * </pre>
913
 */
914
char *
915
stringReplaceSubstr(const char  *src,
916
                    const char  *sub1,
917
                    const char  *sub2,
918
                    l_int32     *ploc,
919
                    l_int32     *pfound)
920
0
{
921
0
const char  *ptr;
922
0
char        *dest;
923
0
l_int32      nsrc, nsub1, nsub2, len, npre, loc;
924
925
0
    if (pfound) *pfound = 0;
926
0
    if (!src || !sub1 || !sub2)
927
0
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
928
0
                                 __func__, NULL);
929
930
0
    if (ploc)
931
0
        loc = *ploc;
932
0
    else
933
0
        loc = 0;
934
0
    if (!strcmp(sub1, sub2))
935
0
        return stringNew(src);
936
0
    if ((ptr = strstr(src + loc, sub1)) == NULL)
937
0
        return stringNew(src);
938
0
    if (pfound) *pfound = 1;
939
940
0
    nsrc = strlen(src);
941
0
    nsub1 = strlen(sub1);
942
0
    nsub2 = strlen(sub2);
943
0
    len = nsrc + nsub2 - nsub1;
944
0
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
945
0
        return (char *)ERROR_PTR("dest not made", __func__, NULL);
946
0
    npre = ptr - src;
947
0
    memcpy(dest, src, npre);
948
0
    strcpy(dest + npre, sub2);
949
0
    strcpy(dest + npre + nsub2, ptr + nsub1);
950
0
    if (ploc) *ploc = npre + nsub2;
951
0
    return dest;
952
0
}
953
954
955
/*!
956
 * \brief   stringFindEachSubstr()
957
 *
958
 * \param[in]    src        input string; can be of zero length
959
 * \param[in]    sub        substring to be searched for
960
 * \return  dna of offsets where the sequence is found, or NULL if
961
 *              none are found or on error
962
 *
963
 * <pre>
964
 * Notes:
965
 *      (1) This finds every non-overlapping occurrence in %src of %sub.
966
 *          After it finds each match, it moves forward in %src by the length
967
 *          of %sub before continuing the search.  So for example,
968
 *          if you search for the sequence 'aa' in the data 'baaabbb',
969
 *          you find one match at position 1.
970
971
 * </pre>
972
 */
973
L_DNA *
974
stringFindEachSubstr(const char  *src,
975
                     const char  *sub)
976
0
{
977
0
    if (!src || !sub)
978
0
        return (L_DNA *)ERROR_PTR("src, sub not both defined", __func__, NULL);
979
980
0
    return arrayFindEachSequence((const l_uint8 *)src, strlen(src),
981
0
                                 (const l_uint8 *)sub, strlen(sub));
982
0
}
983
984
985
/*!
986
 * \brief   stringFindSubstr()
987
 *
988
 * \param[in]    src     input string; can be of zero length
989
 * \param[in]    sub     substring to be searched for; must not be empty
990
 * \param[out]   ploc    [optional] location of substring in src
991
 * \return  1 if found; 0 if not found or on error
992
 *
993
 * <pre>
994
 * Notes:
995
 *      (1) This is a wrapper around strstr().  It finds the first
996
 *          instance of %sub in %src.  If the substring is not found
997
 *          and the location is returned, it has the value -1.
998
 *      (2) Both %src and %sub must be defined, and %sub must have
999
 *          length of at least 1.
1000
 * </pre>
1001
 */
1002
l_int32
1003
stringFindSubstr(const char  *src,
1004
                 const char  *sub,
1005
                 l_int32     *ploc)
1006
0
{
1007
0
const char *ptr;
1008
1009
0
    if (ploc) *ploc = -1;
1010
0
    if (!src || !sub)
1011
0
        return ERROR_INT("src and sub not both defined", __func__, 0);
1012
0
    if (strlen(sub) == 0)
1013
0
        return ERROR_INT("substring length 0", __func__, 0);
1014
0
    if (strlen(src) == 0)
1015
0
        return 0;
1016
1017
0
    if ((ptr = strstr(src, sub)) == NULL)  /* not found */
1018
0
        return 0;
1019
1020
0
    if (ploc)
1021
0
        *ploc = ptr - src;
1022
0
    return 1;
1023
0
}
1024
1025
1026
/*!
1027
 * \brief   arrayReplaceEachSequence()
1028
 *
1029
 * \param[in]    datas       source byte array
1030
 * \param[in]    dataslen    length of source data, in bytes
1031
 * \param[in]    seq         subarray of bytes to find in source data
1032
 * \param[in]    seqlen      length of subarray, in bytes
1033
 * \param[in]    newseq      replacement subarray; can be null
1034
 * \param[in]    newseqlen   length of replacement subarray, in bytes
1035
 * \param[out]   pdatadlen   length of dest byte array, in bytes
1036
 * \param[out]   pcount      [optional] the number of times that sub1
1037
 *                           is found in src; 0 if not found
1038
 * \return  datad   with all all subarrays replaced (or removed)
1039
 *
1040
 * <pre>
1041
 * Notes:
1042
 *      (1) The byte arrays %datas, %seq and %newseq are not C strings,
1043
 *          because they can contain null bytes.  Therefore, for each
1044
 *          we must give the length of the array.
1045
 *      (2) If %newseq == NULL, this just removes all instances of %seq.
1046
 *          Otherwise, it replaces every non-overlapping occurrence of
1047
 *          %seq in %datas with %newseq. A new array %datad and its
1048
 *          size are returned.  See arrayFindEachSequence() for more
1049
 *          details on finding non-overlapping occurrences.
1050
 *      (3) If no instances of %seq are found, this returns a copy of %datas.
1051
 *      (4) The returned %datad is null terminated.
1052
 *      (5) Can use stringReplaceEachSubstr() if using C strings.
1053
 * </pre>
1054
 */
1055
l_uint8 *
1056
arrayReplaceEachSequence(const l_uint8  *datas,
1057
                         size_t          dataslen,
1058
                         const l_uint8  *seq,
1059
                         size_t          seqlen,
1060
                         const l_uint8  *newseq,
1061
                         size_t          newseqlen,
1062
                         size_t         *pdatadlen,
1063
                         l_int32        *pcount)
1064
0
{
1065
0
l_uint8  *datad;
1066
0
size_t    newsize;
1067
0
l_int32   n, i, j, di, si, index, incr;
1068
0
L_DNA    *da;
1069
1070
0
    if (pcount) *pcount = 0;
1071
0
    if (!datas || !seq)
1072
0
        return (l_uint8 *)ERROR_PTR("datas & seq not both defined",
1073
0
                                    __func__, NULL);
1074
0
    if (!pdatadlen)
1075
0
        return (l_uint8 *)ERROR_PTR("&datadlen not defined", __func__, NULL);
1076
0
    *pdatadlen = 0;
1077
1078
        /* Identify the locations of the sequence.  If there are none,
1079
         * return a copy of %datas. */
1080
0
    if ((da = arrayFindEachSequence(datas, dataslen, seq, seqlen)) == NULL) {
1081
0
        *pdatadlen = dataslen;
1082
0
        return l_binaryCopy(datas, dataslen);
1083
0
    }
1084
1085
        /* Allocate the output data; insure null termination */
1086
0
    n = l_dnaGetCount(da);
1087
0
    if (pcount) *pcount = n;
1088
0
    if (!newseq) newseqlen = 0;
1089
0
    newsize = dataslen + n * (newseqlen - seqlen) + 4;
1090
0
    if ((datad = (l_uint8 *)LEPT_CALLOC(newsize, sizeof(l_uint8))) == NULL) {
1091
0
        l_dnaDestroy(&da);
1092
0
        return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1093
0
    }
1094
1095
        /* Replace each sequence instance with a new sequence */
1096
0
    l_dnaGetIValue(da, 0, &si);
1097
0
    for (i = 0, di = 0, index = 0; i < dataslen; i++) {
1098
0
        if (i == si) {
1099
0
            index++;
1100
0
            if (index < n) {
1101
0
                l_dnaGetIValue(da, index, &si);
1102
0
                incr = L_MIN(seqlen, si - i);  /* amount to remove from datas */
1103
0
            } else {
1104
0
                incr = seqlen;
1105
0
            }
1106
0
            i += incr - 1;  /* jump over the matched sequence in datas */
1107
0
            if (newseq) {  /* add new sequence to datad */
1108
0
                for (j = 0; j < newseqlen; j++)
1109
0
                    datad[di++] = newseq[j];
1110
0
            }
1111
0
        } else {
1112
0
            datad[di++] = datas[i];
1113
0
        }
1114
0
    }
1115
1116
0
    *pdatadlen = di;
1117
0
    l_dnaDestroy(&da);
1118
0
    return datad;
1119
0
}
1120
1121
1122
/*!
1123
 * \brief   arrayFindEachSequence()
1124
 *
1125
 * \param[in]    data       byte array
1126
 * \param[in]    datalen    length of data, in bytes
1127
 * \param[in]    sequence   subarray of bytes to find in data
1128
 * \param[in]    seqlen     length of sequence, in bytes
1129
 * \return  dna of offsets where the sequence is found, or NULL if
1130
 *              none are found or on error
1131
 *
1132
 * <pre>
1133
 * Notes:
1134
 *      (1) The byte arrays %data and %sequence are not C strings,
1135
 *          because they can contain null bytes.  Therefore, for each
1136
 *          we must give the length of the array.
1137
 *      (2) This finds every non-overlapping occurrence in %data of %sequence.
1138
 *          After it finds each match, it moves forward by the length
1139
 *          of the sequence before continuing the search.  So for example,
1140
 *          if you search for the sequence 'aa' in the data 'baaabbb',
1141
 *          you find one match at position 1.
1142
 * </pre>
1143
 */
1144
L_DNA *
1145
arrayFindEachSequence(const l_uint8  *data,
1146
                      size_t          datalen,
1147
                      const l_uint8  *sequence,
1148
                      size_t          seqlen)
1149
0
{
1150
0
l_int32  start, offset, realoffset, found;
1151
0
L_DNA   *da;
1152
1153
0
    if (!data || !sequence)
1154
0
        return (L_DNA *)ERROR_PTR("data & sequence not both defined",
1155
0
                                  __func__, NULL);
1156
1157
0
    da = l_dnaCreate(0);
1158
0
    start = 0;
1159
0
    while (1) {
1160
0
        arrayFindSequence(data + start, datalen - start, sequence, seqlen,
1161
0
                          &offset, &found);
1162
0
        if (found == FALSE)
1163
0
            break;
1164
1165
0
        realoffset = start + offset;
1166
0
        l_dnaAddNumber(da, realoffset);
1167
0
        start = realoffset + seqlen;
1168
0
        if (start >= datalen)
1169
0
            break;
1170
0
    }
1171
1172
0
    if (l_dnaGetCount(da) == 0)
1173
0
        l_dnaDestroy(&da);
1174
0
    return da;
1175
0
}
1176
1177
1178
/*!
1179
 * \brief   arrayFindSequence()
1180
 *
1181
 * \param[in]    data       byte array
1182
 * \param[in]    datalen    length of data, in bytes
1183
 * \param[in]    sequence   subarray of bytes to find in data
1184
 * \param[in]    seqlen     length of sequence, in bytes
1185
 * \param[out]   poffset    offset from beginning of
1186
 *                          data where the sequence begins
1187
 * \param[out]   pfound     1 if sequence is found; 0 otherwise
1188
 * \return  0 if OK, 1 on error
1189
 *
1190
 * <pre>
1191
 * Notes:
1192
 *      (1) The byte arrays 'data' and 'sequence' are in general not C strings,
1193
 *          because they can contain null bytes.  Therefore, for each
1194
 *          we must give the length of the array.
1195
 *      (2) This searches for the first occurrence in %data of %sequence,
1196
 *          which consists of %seqlen bytes.  The parameter %seqlen
1197
 *          must not exceed the actual length of the %sequence byte array.
1198
 *      (3) If either byte array is a C string, cast the array to
1199
 *          (const l_uint8 *) and use strlen() on the string for its length.
1200
 *      (4) If the sequence is not found, the offset will be 0, so you
1201
 *          must check %found.
1202
 * </pre>
1203
 */
1204
l_ok
1205
arrayFindSequence(const l_uint8  *data,
1206
                  size_t          datalen,
1207
                  const l_uint8  *sequence,
1208
                  size_t          seqlen,
1209
                  l_int32        *poffset,
1210
                  l_int32        *pfound)
1211
0
{
1212
0
l_int32  i, j, found, lastpos;
1213
1214
0
    if (poffset) *poffset = 0;
1215
0
    if (pfound) *pfound = FALSE;
1216
0
    if (!data || !sequence)
1217
0
        return ERROR_INT("data & sequence not both defined", __func__, 1);
1218
0
    if (!poffset || !pfound)
1219
0
        return ERROR_INT("&offset and &found not defined", __func__, 1);
1220
1221
0
    lastpos = datalen - seqlen + 1;
1222
0
    found = FALSE;
1223
0
    for (i = 0; i < lastpos; i++) {
1224
0
        for (j = 0; j < seqlen; j++) {
1225
0
            if (data[i + j] != sequence[j])
1226
0
                 break;
1227
0
            if (j == seqlen - 1)
1228
0
                 found = TRUE;
1229
0
        }
1230
0
        if (found == TRUE)
1231
0
            break;
1232
0
    }
1233
1234
0
    if (found == TRUE) {
1235
0
        *poffset = i;
1236
0
        *pfound = TRUE;
1237
0
    }
1238
0
    return 0;
1239
0
}
1240
1241
1242
/*--------------------------------------------------------------------*
1243
 *                             Safe realloc                           *
1244
 *--------------------------------------------------------------------*/
1245
/*!
 * \brief   reallocNew()
 *
 * \param[in,out]  pindata    address of ptr to the input data; the ptr is
 *                            nulled when the input data is freed
 * \param[in]      oldsize    number of bytes of input data to be copied
 * \param[in]      newsize    number of bytes in the new buffer
 * \return  ptr to new data, or NULL on error
 *
 *  Action (N.B. steps 3 and 4!):
 *      1 Allocates memory, initialized to 0
 *      2 Copies as much of the input data as possible
 *          to the new block, truncating the copy if necessary
 *      3 Frees the input data
 *      4 Zeroes the input data ptr
 *
 * <pre>
 * Notes:
 *      (1) If newsize == 0, this frees the input data and nulls the ptr;
 *          NULL is returned.
 *      (2) If the input data is null, this only callocs new memory.
 *      (3) Unlike realloc, this always allocates new memory (when
 *          newsize > 0), initialized to 0; it must be told how many
 *          bytes of old data to copy; and it takes the address of the
 *          input ptr so that the handle can be nulled.
 *      (4) If the new allocation fails, the input data is neither
 *          freed nor nulled.
 * </pre>
 */
void *
reallocNew(void  **pindata,
           size_t  oldsize,
           size_t  newsize)
{
void    *olddata;
void    *fresh;
size_t   ncopy;

    if (!pindata)
        return ERROR_PTR("input data not defined", __func__, NULL);
    olddata = *pindata;

        /* Nonstandard usage: release only */
    if (newsize == 0) {
        if (olddata) {
            LEPT_FREE(olddata);
            *pindata = NULL;
        }
        return NULL;
    }

    if ((fresh = (void *)LEPT_CALLOC(1, newsize)) == NULL)
        return ERROR_PTR("newdata not made", __func__, NULL);

        /* Nonstandard usage: nothing to copy or free */
    if (olddata == NULL)
        return fresh;

        /* Standard usage: copy what fits, then release the old block */
    ncopy = L_MIN(oldsize, newsize);
    memcpy(fresh, olddata, ncopy);
    LEPT_FREE(olddata);
    *pindata = NULL;
    return fresh;
}
1307
1308
1309
/*--------------------------------------------------------------------*
1310
 *                 Read and write between file and memory             *
1311
 *--------------------------------------------------------------------*/
1312
/*!
1313
 * \brief   l_binaryRead()
1314
 *
1315
 * \param[in]    filename
1316
 * \param[out]   pnbytes    number of bytes read
1317
 * \return  data, or NULL on error
1318
 */
1319
l_uint8 *
1320
l_binaryRead(const char  *filename,
1321
             size_t      *pnbytes)
1322
0
{
1323
0
l_uint8  *data;
1324
0
FILE     *fp;
1325
1326
0
    if (!pnbytes)
1327
0
        return (l_uint8 *)ERROR_PTR("pnbytes not defined", __func__, NULL);
1328
0
    *pnbytes = 0;
1329
0
    if (!filename)
1330
0
        return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1331
1332
0
    if ((fp = fopenReadStream(filename)) == NULL)
1333
0
        return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1334
0
                                      filename, __func__, NULL);
1335
0
    data = l_binaryReadStream(fp, pnbytes);
1336
0
    fclose(fp);
1337
0
    return data;
1338
0
}
1339
1340
1341
/*!
1342
 * \brief   l_binaryReadStream()
1343
 *
1344
 * \param[in]    fp        file stream opened to read; can be stdin
1345
 * \param[out]   pnbytes   number of bytes read
1346
 * \return  null-terminated array, or NULL on error; reading 0 bytes
1347
 *          is not an error
1348
 *
1349
 * <pre>
1350
 * Notes:
1351
 *      (1) The returned array is terminated with a null byte so that it can
1352
 *          be used to read ascii data from a file into a proper C string.
1353
 *      (2) This can be used to capture data that is piped in via stdin,
1354
 *          because it does not require seeking within the file.
1355
 *      (3) For example, you can read an image from stdin into memory
1356
 *          using shell redirection, with one of these shell commands:
1357
 * \code
1358
 *             cat <imagefile> | readprog
1359
 *             readprog < <imagefile>
1360
 * \endcode
1361
 *          where readprog is:
1362
 * \code
1363
 *             l_uint8 *data = l_binaryReadStream(stdin, &nbytes);
1364
 *             Pix *pix = pixReadMem(data, nbytes);
1365
 * \endcode
1366
 * </pre>
1367
 */
1368
l_uint8 *
1369
l_binaryReadStream(FILE    *fp,
1370
                   size_t  *pnbytes)
1371
0
{
1372
0
l_uint8    *data;
1373
0
l_int32     seekable, navail, nadd, nread;
1374
0
L_BBUFFER  *bb;
1375
1376
0
    if (!pnbytes)
1377
0
        return (l_uint8 *)ERROR_PTR("&nbytes not defined", __func__, NULL);
1378
0
    *pnbytes = 0;
1379
0
    if (!fp)
1380
0
        return (l_uint8 *)ERROR_PTR("fp not defined", __func__, NULL);
1381
1382
        /* Test if the stream is seekable, by attempting to seek to
1383
         * the start of data.  This is a no-op.  If it is seekable, use
1384
         * l_binaryReadSelectStream() to determine the size of the
1385
         * data to be read in advance. */
1386
0
    seekable = (ftell(fp) == 0) ? 1 : 0;
1387
0
    if (seekable)
1388
0
        return l_binaryReadSelectStream(fp, 0, 0, pnbytes);
1389
1390
        /* If it is not seekable, use the bbuffer to realloc memory
1391
         * as needed during reading. */
1392
0
    bb = bbufferCreate(NULL, 4096);
1393
0
    while (1) {
1394
0
        navail = bb->nalloc - bb->n;
1395
0
        if (navail < 4096) {
1396
0
             nadd = L_MAX(bb->nalloc, 4096);
1397
0
             bbufferExtendArray(bb, nadd);
1398
0
        }
1399
0
        nread = fread((void *)(bb->array + bb->n), 1, 4096, fp);
1400
0
        bb->n += nread;
1401
0
        if (nread != 4096) break;
1402
0
    }
1403
1404
        /* Copy the data to a new array sized for the data, because
1405
         * the bbuffer array can be nearly twice the size we need. */
1406
0
    if ((data = (l_uint8 *)LEPT_CALLOC(bb->n + 1, sizeof(l_uint8))) != NULL) {
1407
0
        memcpy(data, bb->array, bb->n);
1408
0
        *pnbytes = bb->n;
1409
0
    } else {
1410
0
        L_ERROR("calloc fail for data\n", __func__);
1411
0
    }
1412
1413
0
    bbufferDestroy(&bb);
1414
0
    return data;
1415
0
}
1416
1417
1418
/*!
1419
 * \brief   l_binaryReadSelect()
1420
 *
1421
 * \param[in]    filename
1422
 * \param[in]    start     first byte to read
1423
 * \param[in]    nbytes    number of bytes to read; use 0 to read to end of file
1424
 * \param[out]   pnread    number of bytes actually read
1425
 * \return  data, or NULL on error
1426
 *
1427
 * <pre>
1428
 * Notes:
1429
 *      (1) The returned array is terminated with a null byte so that it can
1430
 *          be used to read ascii data from a file into a proper C string.
1431
 * </pre>
1432
 */
1433
l_uint8 *
1434
l_binaryReadSelect(const char  *filename,
1435
                   size_t       start,
1436
                   size_t       nbytes,
1437
                   size_t      *pnread)
1438
0
{
1439
0
l_uint8  *data;
1440
0
FILE     *fp;
1441
1442
0
    if (!pnread)
1443
0
        return (l_uint8 *)ERROR_PTR("pnread not defined", __func__, NULL);
1444
0
    *pnread = 0;
1445
0
    if (!filename)
1446
0
        return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1447
1448
0
    if ((fp = fopenReadStream(filename)) == NULL)
1449
0
        return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1450
0
                                      filename, __func__, NULL);
1451
0
    data = l_binaryReadSelectStream(fp, start, nbytes, pnread);
1452
0
    fclose(fp);
1453
0
    return data;
1454
0
}
1455
1456
1457
/*!
1458
 * \brief   l_binaryReadSelectStream()
1459
 *
1460
 * \param[in]    fp       file stream
1461
 * \param[in]    start    first byte to read
1462
 * \param[in]    nbytes   number of bytes to read; use 0 to read to end of file
1463
 * \param[out]   pnread   number of bytes actually read
1464
 * \return  null-terminated array, or NULL on error; reading 0 bytes
1465
 *          is not an error
1466
 *
1467
 * <pre>
1468
 * Notes:
1469
 *      (1) The returned array is terminated with a null byte so that it can
1470
 *          be used to read ascii data from a file into a proper C string.
1471
 *          If the file to be read is empty and %start == 0, an array
1472
 *          with a single null byte is returned.
1473
 *      (2) Side effect: the stream pointer is re-positioned to the
1474
 *          beginning of the file.
1475
 * </pre>
1476
 */
1477
l_uint8 *
1478
l_binaryReadSelectStream(FILE    *fp,
1479
                         size_t   start,
1480
                         size_t   nbytes,
1481
                         size_t  *pnread)
1482
0
{
1483
0
l_uint8  *data;
1484
0
size_t    bytesleft, bytestoread, nread, filebytes;
1485
1486
0
    if (!pnread)
1487
0
        return (l_uint8 *)ERROR_PTR("&nread not defined", __func__, NULL);
1488
0
    *pnread = 0;
1489
0
    if (!fp)
1490
0
        return (l_uint8 *)ERROR_PTR("stream not defined", __func__, NULL);
1491
1492
        /* Verify and adjust the parameters if necessary */
1493
0
    fseek(fp, 0, SEEK_END);  /* EOF */
1494
0
    filebytes = ftell(fp);
1495
0
    fseek(fp, 0, SEEK_SET);
1496
0
    if (start > filebytes) {
1497
0
        L_ERROR("start = %zu but filebytes = %zu\n", __func__,
1498
0
                start, filebytes);
1499
0
        return NULL;
1500
0
    }
1501
0
    if (filebytes == 0)  /* start == 0; nothing to read; return null byte */
1502
0
        return (l_uint8 *)LEPT_CALLOC(1, 1);
1503
0
    bytesleft = filebytes - start;  /* greater than 0 */
1504
0
    if (nbytes == 0) nbytes = bytesleft;
1505
0
    bytestoread = (bytesleft >= nbytes) ? nbytes : bytesleft;
1506
1507
        /* Read the data */
1508
0
    if ((data = (l_uint8 *)LEPT_CALLOC(1, bytestoread + 1)) == NULL)
1509
0
        return (l_uint8 *)ERROR_PTR("calloc fail for data", __func__, NULL);
1510
0
    fseek(fp, start, SEEK_SET);
1511
0
    nread = fread(data, 1, bytestoread, fp);
1512
0
    if (nbytes != nread)
1513
0
        L_INFO("%zu bytes requested; %zu bytes read\n", __func__,
1514
0
               nbytes, nread);
1515
0
    *pnread = nread;
1516
0
    fseek(fp, 0, SEEK_SET);
1517
0
    return data;
1518
0
}
1519
1520
1521
/*!
1522
 * \brief   l_binaryWrite()
1523
 *
1524
 * \param[in]    filename     output file
1525
 * \param[in]    operation    "w" for write; "a" for append
1526
 * \param[in]    data         binary data to be written
1527
 * \param[in]    nbytes       size of data array
1528
 * \return  0 if OK; 1 on error
1529
 */
1530
l_ok
1531
l_binaryWrite(const char  *filename,
1532
              const char  *operation,
1533
              const void  *data,
1534
              size_t       nbytes)
1535
0
{
1536
0
char   actualOperation[20];
1537
0
FILE  *fp;
1538
1539
0
    if (!filename)
1540
0
        return ERROR_INT("filename not defined", __func__, 1);
1541
0
    if (!operation)
1542
0
        return ERROR_INT("operation not defined", __func__, 1);
1543
0
    if (!data)
1544
0
        return ERROR_INT("data not defined", __func__, 1);
1545
0
    if (nbytes <= 0)
1546
0
        return ERROR_INT("nbytes must be > 0", __func__, 1);
1547
1548
0
    if (strcmp(operation, "w") && strcmp(operation, "a"))
1549
0
        return ERROR_INT("operation not one of {'w','a'}", __func__, 1);
1550
1551
        /* The 'b' flag to fopen() is ignored for all POSIX
1552
         * conforming systems.  However, Windows needs the 'b' flag. */
1553
0
    stringCopy(actualOperation, operation, 2);
1554
0
    stringCat(actualOperation, 20, "b");
1555
1556
0
    if ((fp = fopenWriteStream(filename, actualOperation)) == NULL)
1557
0
        return ERROR_INT_1("stream not opened", filename, __func__, 1);
1558
0
    fwrite(data, 1, nbytes, fp);
1559
0
    fclose(fp);
1560
0
    return 0;
1561
0
}
1562
1563
1564
/*!
 * \brief   nbytesInFile()
 *
 * \param[in]    filename   file to be measured
 * \return  nbytes in file; 0 on error
 *
 * <pre>
 * Notes:
 *      (1) The file is opened with fopenReadStream(), measured with
 *          fnbytesInFile(), and then closed.
 * </pre>
 */
size_t
nbytesInFile(const char  *filename)
{
FILE   *stream;
size_t  count;

    if (filename == NULL)
        return ERROR_INT("filename not defined", __func__, 0);

    stream = fopenReadStream(filename);
    if (stream == NULL)
        return ERROR_INT_1("stream not opened", filename, __func__, 0);
    count = fnbytesInFile(stream);
    fclose(stream);
    return count;
}
1584
1585
1586
/*!
1587
 * \brief   fnbytesInFile()
1588
 *
1589
 * \param[in]    fp    file stream
1590
 * \return  nbytes in file; 0 on error
1591
 */
1592
size_t
1593
fnbytesInFile(FILE  *fp)
1594
0
{
1595
0
l_int64  pos, nbytes;
1596
1597
0
    if (!fp)
1598
0
        return ERROR_INT("stream not open", __func__, 0);
1599
1600
0
    pos = ftell(fp);          /* initial position */
1601
0
    if (pos < 0)
1602
0
        return ERROR_INT("seek position must be > 0", __func__, 0);
1603
0
    fseek(fp, 0, SEEK_END);   /* EOF */
1604
0
    nbytes = ftell(fp);
1605
0
    if (nbytes < 0)
1606
0
        return ERROR_INT("nbytes is < 0", __func__, 0);
1607
0
    fseek(fp, pos, SEEK_SET);        /* back to initial position */
1608
0
    return nbytes;
1609
0
}
1610
1611
1612
/*--------------------------------------------------------------------*
1613
 *                     Copy and compare in memory                     *
1614
 *--------------------------------------------------------------------*/
1615
/*!
1616
 * \brief   l_binaryCopy()
1617
 *
1618
 * \param[in]    datas
1619
 * \param[in]    size    of data array
1620
 * \return  datad on heap, or NULL on error
1621
 *
1622
 * <pre>
1623
 * Notes:
1624
 *      (1) We add 4 bytes to the zeroed output because in some cases
1625
 *          (e.g., string handling) it is important to have the data
1626
 *          be null terminated.  This guarantees that after the memcpy,
1627
 *          the result is automatically null terminated.
1628
 * </pre>
1629
 */
1630
l_uint8 *
1631
l_binaryCopy(const l_uint8  *datas,
1632
             size_t          size)
1633
0
{
1634
0
l_uint8  *datad;
1635
1636
0
    if (!datas)
1637
0
        return (l_uint8 *)ERROR_PTR("datas not defined", __func__, NULL);
1638
1639
0
    if ((datad = (l_uint8 *)LEPT_CALLOC(size + 4, sizeof(l_uint8))) == NULL)
1640
0
        return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1641
0
    memcpy(datad, datas, size);
1642
0
    return datad;
1643
0
}
1644
1645
1646
/*!
1647
 * \brief   l_binaryCompare()
1648
 *
1649
 * \param[in]    data1
1650
 * \param[in]    size1   of data1
1651
 * \param[in]    data2
1652
 * \param[in]    size2   of data1
1653
 * \param[out]   psame  (1 if the same, 0 if different)
1654
 * \return  0 if OK, 1 on error
1655
 *
1656
 * <pre>
1657
 * Notes:
1658
 *      (1) This can also be used to compare C strings str1 and str2.
1659
 *          If the string lengths are not known, use strlen():
1660
 *            l_binaryCompare((l_uint8 *)str1, strlen(str1),
1661
                              (l_uint8 *)str2, strlen(str2));
1662
 * </pre>
1663
 */
1664
l_ok
1665
l_binaryCompare(const l_uint8  *data1,
1666
                size_t          size1,
1667
                const l_uint8  *data2,
1668
                size_t          size2,
1669
                l_int32        *psame)
1670
0
{
1671
0
l_int32  i;
1672
1673
0
    if (!psame)
1674
0
        return ERROR_INT("&same not defined", __func__, 1);
1675
0
    *psame = FALSE;
1676
0
    if (!data1 || !data2)
1677
0
        return ERROR_INT("data1 and data2 not both defined", __func__, 1);
1678
0
    if (size1 != size2) return 0;
1679
0
    for (i = 0; i < size1; i++) {
1680
0
        if (data1[i] != data2[i])
1681
0
            return 0;
1682
0
    }
1683
0
    *psame = TRUE;
1684
0
    return 0;
1685
0
}
1686
1687
1688
/*--------------------------------------------------------------------*
1689
 *                         File copy operations                       *
1690
 *--------------------------------------------------------------------*/
1691
/*!
1692
 * \brief   fileCopy()
1693
 *
1694
 * \param[in]    srcfile   copy from this file
1695
 * \param[in]    newfile   copy to this file
1696
 * \return  0 if OK, 1 on error
1697
 */
1698
l_ok
1699
fileCopy(const char  *srcfile,
1700
         const char  *newfile)
1701
0
{
1702
0
l_int32   ret;
1703
0
size_t    nbytes;
1704
0
l_uint8  *data;
1705
1706
0
    if (!srcfile)
1707
0
        return ERROR_INT("srcfile not defined", __func__, 1);
1708
0
    if (!newfile)
1709
0
        return ERROR_INT("newfile not defined", __func__, 1);
1710
1711
0
    if ((data = l_binaryRead(srcfile, &nbytes)) == NULL)
1712
0
        return ERROR_INT("data not returned", __func__, 1);
1713
0
    ret = l_binaryWrite(newfile, "w", data, nbytes);
1714
0
    LEPT_FREE(data);
1715
0
    return ret;
1716
0
}
1717
1718
1719
/*!
1720
 * \brief   fileConcatenate()
1721
 *
1722
 * \param[in]    srcfile   append data from this file
1723
 * \param[in]    destfile  add data to this file
1724
 * \return  0 if OK, 1 on error
1725
 */
1726
l_ok
1727
fileConcatenate(const char  *srcfile,
1728
                const char  *destfile)
1729
0
{
1730
0
size_t    nbytes;
1731
0
l_uint8  *data;
1732
1733
0
    if (!srcfile)
1734
0
        return ERROR_INT("srcfile not defined", __func__, 1);
1735
0
    if (!destfile)
1736
0
        return ERROR_INT("destfile not defined", __func__, 1);
1737
1738
0
    data = l_binaryRead(srcfile, &nbytes);
1739
0
    l_binaryWrite(destfile, "a", data, nbytes);
1740
0
    LEPT_FREE(data);
1741
0
    return 0;
1742
0
}
1743
1744
1745
/*!
1746
 * \brief   fileAppendString()
1747
 *
1748
 * \param[in]    filename
1749
 * \param[in]    str       string to append to file
1750
 * \return  0 if OK, 1 on error
1751
 */
1752
l_ok
1753
fileAppendString(const char  *filename,
1754
                 const char  *str)
1755
0
{
1756
0
FILE  *fp;
1757
1758
0
    if (!filename)
1759
0
        return ERROR_INT("filename not defined", __func__, 1);
1760
0
    if (!str)
1761
0
        return ERROR_INT("str not defined", __func__, 1);
1762
1763
0
    if ((fp = fopenWriteStream(filename, "a")) == NULL)
1764
0
        return ERROR_INT_1("stream not opened", filename, __func__, 1);
1765
0
    fprintf(fp, "%s", str);
1766
0
    fclose(fp);
1767
0
    return 0;
1768
0
}
1769
1770
1771
/*--------------------------------------------------------------------*
1772
 *                         File split operations                      *
1773
 *--------------------------------------------------------------------*/
1774
/*!
1775
 * \brief   fileSplitLinesUniform()
1776
 *
1777
 * \param[in]    filename      input file
1778
 * \param[in]    n             number of output files (>= 1)
1779
 * \param[in]    save_empty    1 to save empty lines; 0 to remove them
1780
 * \param[in]    rootpath      root pathname of output files
1781
 * \param[in]    ext           output extension, including the '.'; can be NULL
1782
 * \return  0 if OK, 1 on error
1783
 *
1784
 * <pre>
1785
 * Notes:
1786
 *      (1) This splits an input text file into %n files with roughly
1787
 *          equal numbers of text lines in each file.
1788
 *      (2) if %save_empty == 1, empty lines are included, and concatention
1789
 *          of the text in the split files will be identical to the original.
1790
 *      (3) The output filenames are in the form:
1791
 *               <rootpath>_N.<ext>, N = 1, ... n
1792
 *      (4) This handles the temp directory pathname conversion where needed:
1793
 *              /tmp  ==>  [OS specific temp directory]
1794
 *      (5) Files can also be sharded into sets of lines by the program 'split':
1795
 *              split -n l/<n> <filename>
1796
 *          Using 'split', the resulting files have approximately equal
1797
 *          numbers of bytes, rather than equal numbers of lines.
1798
 * </pre>
1799
 */
1800
l_ok
1801
fileSplitLinesUniform(const char  *filename,
1802
                      l_int32      n,
1803
                      l_int32      save_empty,
1804
                      const char  *rootpath,
1805
                      const char  *ext)
1806
0
{
1807
0
l_int32   i, totlines, nlines, index;
1808
0
size_t    nbytes;
1809
0
l_uint8  *data;
1810
0
char     *str;
1811
0
char      outname[512];
1812
0
NUMA     *na;
1813
0
SARRAY   *sa;
1814
1815
0
    if (!filename)
1816
0
        return ERROR_INT("filename not defined", __func__, 1);
1817
0
    if (!rootpath)
1818
0
        return ERROR_INT("rootpath not defined", __func__, 1);
1819
0
    if (n <= 0)
1820
0
        return ERROR_INT("n must be > 0", __func__, 1);
1821
0
    if (save_empty != 0 && save_empty != 1)
1822
0
        return ERROR_INT("save_empty not 0 or 1", __func__, 1);
1823
1824
        /* Make sarray of lines; the newlines are stripped off */
1825
0
    if ((data = l_binaryRead(filename, &nbytes)) == NULL)
1826
0
        return ERROR_INT("data not read", __func__, 1);
1827
0
    sa = sarrayCreateLinesFromString((const char *)data, save_empty);
1828
0
    LEPT_FREE(data);
1829
0
    if (!sa)
1830
0
        return ERROR_INT("sa not made", __func__, 1);
1831
0
    totlines = sarrayGetCount(sa);
1832
0
    if (n > totlines) {
1833
0
        sarrayDestroy(&sa);
1834
0
        L_ERROR("num files = %d > num lines = %d\n", __func__, n, totlines);
1835
0
        return 1;
1836
0
    }
1837
1838
        /* Write n sets of lines to n files, adding the newlines back */
1839
0
    na = numaGetUniformBinSizes(totlines, n);
1840
0
    index = 0;
1841
0
    for (i = 0; i < n; i++) {
1842
0
        if (ext == NULL)
1843
0
            snprintf(outname, sizeof(outname), "%s_%d", rootpath, i);
1844
0
        else
1845
0
            snprintf(outname, sizeof(outname), "%s_%d%s", rootpath, i, ext);
1846
0
        numaGetIValue(na, i, &nlines);
1847
0
        str = sarrayToStringRange(sa, index, nlines, 1);  /* add newlines */
1848
0
        l_binaryWrite(outname, "w", str, strlen(str));
1849
0
        LEPT_FREE(str);
1850
0
        index += nlines;
1851
0
    }
1852
0
    numaDestroy(&na);
1853
0
    sarrayDestroy(&sa);
1854
0
    return 0;
1855
0
}
1856
1857
1858
/*--------------------------------------------------------------------*
1859
 *          Multi-platform functions for opening file streams         *
1860
 *--------------------------------------------------------------------*/
1861
/*!
 * \brief   fopenReadStream()
 *
 * \param[in]    filename
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          read from a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 *      (3) If the file cannot be opened at the converted pathname, the
 *          directory is stripped off and the tail is tried relative to
 *          the current directory.
 *      (4) The stream is opened in mode "rb".
 * </pre>
 */
FILE *
fopenReadStream(const char  *filename)
{
char  *fname, *tail;
FILE  *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);

        /* Try input filename; skip this attempt if no pathname
         * could be generated, rather than passing NULL to fopen() */
    fp = NULL;
    if ((fname = genPathname(filename, NULL)) != NULL) {
        fp = fopen(fname, "rb");
        LEPT_FREE(fname);
    }
    if (fp) return fp;

        /* Else, strip directory and try locally */
    tail = NULL;
    splitPathAtDirectory(filename, NULL, &tail);
    if (!tail)
        return (FILE*)ERROR_PTR_1("tail not found", filename, __func__, NULL);
    fp = fopen(tail, "rb");
    if (!fp)
        L_ERROR("failed to open locally with tail %s for filename %s\n",
                __func__, tail, filename);
    LEPT_FREE(tail);
    return fp;
}
1901
1902
1903
/*!
 * \brief   fopenWriteStream()
 *
 * \param[in]    filename
 * \param[in]    modestring   passed to fopen(); e.g., "w", "wb", "a"
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          write or append to a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 * </pre>
 */
FILE *
fopenWriteStream(const char  *filename,
                 const char  *modestring)
{
char  *fname;
FILE  *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
    if (!modestring)
        return (FILE *)ERROR_PTR("modestring not defined", __func__, NULL);

        /* Never hand a NULL pathname to fopen() */
    if ((fname = genPathname(filename, NULL)) == NULL)
        return (FILE *)ERROR_PTR_1("pathname not made", filename,
                                   __func__, NULL);
    fp = fopen(fname, modestring);
    if (!fp)
        fp = (FILE *)ERROR_PTR_1("stream not opened", fname, __func__, NULL);
    LEPT_FREE(fname);
    return fp;
}
1935
1936
1937
/*!
1938
 * \brief   fopenReadFromMemory()
1939
 *
1940
 * \param[in]    data, size
1941
 * \return  file stream, or NULL on error
1942
 *
1943
 * <pre>
1944
 * Notes:
1945
 *      (1) Work-around if fmemopen() not available.
1946
 *      (2) Windows tmpfile() writes into the root C:\ directory, which
1947
 *          requires admin privileges.  This also works around that.
1948
 * </pre>
1949
 */
1950
FILE *
1951
fopenReadFromMemory(const l_uint8  *data,
1952
                    size_t          size)
1953
0
{
1954
0
FILE  *fp;
1955
1956
0
    if (!data)
1957
0
        return (FILE *)ERROR_PTR("data not defined", __func__, NULL);
1958
1959
0
#if HAVE_FMEMOPEN
1960
0
    if ((fp = fmemopen((void *)data, size, "rb")) == NULL)
1961
0
        return (FILE *)ERROR_PTR("stream not opened", __func__, NULL);
1962
#else  /* write to tmp file */
1963
    L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
1964
  #ifdef _WIN32
1965
    if ((fp = fopenWriteWinTempfile()) == NULL)
1966
        return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1967
  #else
1968
    if ((fp = tmpfile()) == NULL)
1969
        return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1970
  #endif  /*  _WIN32 */
1971
    fwrite(data, 1, size, fp);
1972
    rewind(fp);
1973
#endif  /* HAVE_FMEMOPEN */
1974
1975
0
    return fp;
1976
0
}
1977
1978
1979
/*--------------------------------------------------------------------*
1980
 *                Opening a Windows tmpfile for writing               *
1981
 *--------------------------------------------------------------------*/
1982
/*!
 * \brief   fopenWriteWinTempfile()
 *
 * \return  file stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The Windows version of tmpfile() writes into the root
 *          C:\ directory, which requires admin privileges.  This
 *          function provides an alternative implementation.
 *      (2) The file is opened with _O_TEMPORARY and the returned
 *          stream is in mode "r+b".
 *      (3) On platforms other than Windows this always returns NULL.
 * </pre>
 */
FILE *
fopenWriteWinTempfile(void)
{
#ifdef _WIN32
l_int32  handle;
FILE    *fp;
char    *filename;

    if ((filename = l_makeTempFilename()) == NULL) {
        L_ERROR("l_makeTempFilename failed, %s\n", __func__, strerror(errno));
        return NULL;
    }

    handle = _open(filename, _O_CREAT | _O_RDWR | _O_SHORT_LIVED |
                   _O_TEMPORARY | _O_BINARY, _S_IREAD | _S_IWRITE);
    if (handle == -1) {
            /* Report errno before any other call can change it */
        L_ERROR("_open failed, %s\n", __func__, strerror(errno));
        lept_free(filename);
        return NULL;
    }
    lept_free(filename);

    if ((fp = _fdopen(handle, "r+b")) == NULL) {
        L_ERROR("_fdopen failed, %s\n", __func__, strerror(errno));
        _close(handle);  /* the descriptor is still ours; don't leak it */
        return NULL;
    }

    return fp;
#else
    return NULL;
#endif  /*  _WIN32 */
}
2025
2026
2027
/*--------------------------------------------------------------------*
2028
 *       Multi-platform functions that avoid C-runtime boundary       *
2029
 *             crossing for applications with Windows DLLs            *
2030
 *--------------------------------------------------------------------*/
2031
/*
2032
 *  Problems arise when pointers to streams and data are passed
2033
 *  between two Windows DLLs that have been generated with different
2034
 *  C runtimes.  To avoid this, leptonica provides wrappers for
2035
 *  several C library calls.
2036
 */
2037
/*!
 * \brief   lept_fopen()
 *
 * \param[in]    filename
 * \param[in]    mode       same as for fopen(); e.g., "rb"
 * \return  stream or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This must be used by any application that passes
 *          a file handle to a leptonica Windows DLL.
 *      (2) If %mode contains the character 'r', the file is opened
 *          with fopenReadStream(), and %mode is not passed on.
 *          Otherwise it is opened with fopenWriteStream(), using %mode.
 * </pre>
 */
FILE *
lept_fopen(const char  *filename,
           const char  *mode)
{
FILE  *stream;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
    if (!mode)
        return (FILE *)ERROR_PTR("mode not defined", __func__, NULL);

        /* Dispatch on whether the mode string asks for reading */
    if (stringFindSubstr(mode, "r", NULL) != 0)
        stream = fopenReadStream(filename);
    else
        stream = fopenWriteStream(filename, mode);
    return stream;
}
2064
2065
2066
/*!
2067
 * \brief   lept_fclose()
2068
 *
2069
 * \param[in]    fp    file stream
2070
 * \return  0 if OK, 1 on error
2071
 *
2072
 * <pre>
2073
 * Notes:
2074
 *      (1) This should be used by any application that accepts
2075
 *          a file handle generated by a leptonica Windows DLL.
2076
 * </pre>
2077
 */
2078
l_ok
2079
lept_fclose(FILE *fp)
2080
0
{
2081
0
    if (!fp)
2082
0
        return ERROR_INT("stream not defined", __func__, 1);
2083
2084
0
    return fclose(fp);
2085
0
}
2086
2087
2088
/*!
 * \brief   lept_calloc()
 *
 * \param[in]    nmemb    number of members
 * \param[in]    size     of each member
 * \return  void ptr, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) For safety with Windows DLLs, this can be used in conjunction
 *          with lept_free() to avoid C-runtime boundary problems.
 *          Just use these two functions throughout your application.
 *      (2) NULL is returned, without allocating, if either %nmemb
 *          or %size is 0.
 * </pre>
 */
void *
lept_calloc(size_t  nmemb,
            size_t  size)
{
        /* Both arguments are unsigned, so only 0 needs rejecting */
    if (nmemb == 0 || size == 0)
        return NULL;

    return LEPT_CALLOC(nmemb, size);
}
2110
2111
2112
/*!
 * \brief   lept_free()
 *
 * \param[in]    ptr    heap data to be released; NULL is ignored
 *
 * <pre>
 * Notes:
 *      (1) This should be used by any application that accepts
 *          heap data allocated by a leptonica Windows DLL.
 * </pre>
 */
void
lept_free(void *ptr)
{
    if (ptr != NULL) {
        LEPT_FREE(ptr);
    }
}
2129
2130
2131
/*--------------------------------------------------------------------*
2132
 *                Multi-platform file system operations               *
2133
 *         [ These only write to /tmp or its subdirectories ]         *
2134
 *--------------------------------------------------------------------*/
2135
/*!
2136
 * \brief   lept_mkdir()
2137
 *
2138
 * \param[in]    subdir    of /tmp or its OS specific equivalent
2139
 * \return  0 on success, non-zero on failure
2140
 *
2141
 * <pre>
2142
 * Notes:
2143
 *      (1) %subdir is a partial path that can consist of one or more
2144
 *          directories.
2145
 *      (2) This makes any subdirectories of /tmp that are required.
2146
 *      (3) The root temp directory is:
2147
 *            /tmp    (unix)  [default]
2148
 *            [Temp]  (Windows)
2149
 * </pre>
2150
 */
2151
l_int32
2152
lept_mkdir(const char  *subdir)
2153
0
{
2154
0
char     *dir, *tmpdir;
2155
0
l_int32   i, n;
2156
0
l_int32   ret = 0;
2157
0
SARRAY   *sa;
2158
#ifdef  _WIN32
2159
l_uint32  attributes;
2160
#endif  /* _WIN32 */
2161
2162
0
    if (!LeptDebugOK) {
2163
0
        L_INFO("making named temp subdirectory %s is disabled\n",
2164
0
               __func__, subdir);
2165
0
        return 0;
2166
0
    }
2167
2168
0
    if (!subdir)
2169
0
        return ERROR_INT("subdir not defined", __func__, 1);
2170
0
    if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2171
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2172
2173
0
    sa = sarrayCreate(0);
2174
0
    sarraySplitString(sa, subdir, "/");
2175
0
    n = sarrayGetCount(sa);
2176
0
    dir = genPathname("/tmp", NULL);
2177
       /* Make sure the tmp directory exists */
2178
0
#ifndef _WIN32
2179
0
    ret = mkdir(dir, 0777);
2180
#else
2181
    attributes = GetFileAttributesA(dir);
2182
    if (attributes == INVALID_FILE_ATTRIBUTES)
2183
        ret = (CreateDirectoryA(dir, NULL) ? 0 : 1);
2184
#endif
2185
        /* Make all the subdirectories */
2186
0
    for (i = 0; i < n; i++) {
2187
0
        tmpdir = pathJoin(dir, sarrayGetString(sa, i, L_NOCOPY));
2188
0
#ifndef _WIN32
2189
0
        ret += mkdir(tmpdir, 0777);
2190
#else
2191
        if (CreateDirectoryA(tmpdir, NULL) == 0)
2192
            ret += (GetLastError() != ERROR_ALREADY_EXISTS);
2193
#endif
2194
0
        LEPT_FREE(dir);
2195
0
        dir = tmpdir;
2196
0
    }
2197
0
    LEPT_FREE(dir);
2198
0
    sarrayDestroy(&sa);
2199
0
    if (ret > 0)
2200
0
        L_ERROR("failure to create %d directories\n", __func__, ret);
2201
0
    return ret;
2202
0
}
2203
2204
2205
/*!
2206
 * \brief   lept_rmdir()
2207
 *
2208
 * \param[in]    subdir    of /tmp or its OS specific equivalent
2209
 * \return  0 on success, non-zero on failure
2210
 *
2211
 * <pre>
2212
 * Notes:
2213
 *      (1) %subdir is a partial path that can consist of one or more
2214
 *          directories.
2215
 *      (2) This removes all files from the specified subdirectory of
2216
 *          the root temp directory:
2217
 *            /tmp    (unix)
2218
 *            [Temp]  (Windows)
2219
 *          and then removes the subdirectory.
2220
 *      (3) The combination
2221
 *            lept_rmdir(subdir);
2222
 *            lept_mkdir(subdir);
2223
 *          is guaranteed to give you an empty subdirectory.
2224
 * </pre>
2225
 */
2226
l_int32
2227
lept_rmdir(const char  *subdir)
2228
0
{
2229
0
char    *dir, *fname, *fullname;
2230
0
l_int32  exists, ret, i, nfiles;
2231
0
SARRAY  *sa;
2232
#ifdef _WIN32
2233
char    *newpath;
2234
#else
2235
0
char    *realdir;
2236
0
#endif  /* _WIN32 */
2237
2238
0
    if (!subdir)
2239
0
        return ERROR_INT("subdir not defined", __func__, 1);
2240
0
    if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2241
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2242
2243
        /* Find the temp subdirectory */
2244
0
    dir = pathJoin("/tmp", subdir);
2245
0
    if (!dir)
2246
0
        return ERROR_INT("directory name not made", __func__, 1);
2247
0
    lept_direxists(dir, &exists);
2248
0
    if (!exists) {  /* fail silently */
2249
0
        LEPT_FREE(dir);
2250
0
        return 0;
2251
0
    }
2252
2253
        /* List all the files in that directory */
2254
0
    if ((sa = getFilenamesInDirectory(dir)) == NULL) {
2255
0
        L_ERROR("directory %s does not exist!\n", __func__, dir);
2256
0
        LEPT_FREE(dir);
2257
0
        return 1;
2258
0
    }
2259
0
    nfiles = sarrayGetCount(sa);
2260
2261
0
    for (i = 0; i < nfiles; i++) {
2262
0
        fname = sarrayGetString(sa, i, L_NOCOPY);
2263
0
        fullname = genPathname(dir, fname);
2264
0
        remove(fullname);
2265
0
        LEPT_FREE(fullname);
2266
0
    }
2267
2268
0
#ifndef _WIN32
2269
0
    realdir = genPathname("/tmp", subdir);
2270
0
    ret = rmdir(realdir);
2271
0
    LEPT_FREE(realdir);
2272
#else
2273
    newpath = genPathname(dir, NULL);
2274
    ret = (RemoveDirectoryA(newpath) ? 0 : 1);
2275
    LEPT_FREE(newpath);
2276
#endif  /* !_WIN32 */
2277
2278
0
    sarrayDestroy(&sa);
2279
0
    LEPT_FREE(dir);
2280
0
    return ret;
2281
0
}
2282
2283
2284
/*!
2285
 * \brief   lept_direxists()
2286
 *
2287
 * \param[in]    dir
2288
 * \param[out]   pexists    1 if it exists; 0 otherwise
2289
 * \return  void
2290
 *
2291
 * <pre>
2292
 * Notes:
2293
 *      (1) Always use unix pathname separators.
2294
 *      (2) By calling genPathname(), if the pathname begins with "/tmp"
2295
 *          this does an automatic directory translation for operating
2296
 *          systems that use a different path for /tmp.
2297
 * </pre>
2298
 */
2299
void
2300
lept_direxists(const char  *dir,
2301
               l_int32     *pexists)
2302
0
{
2303
0
char  *realdir;
2304
2305
0
    if (!pexists) return;
2306
0
    *pexists = 0;
2307
0
    if (!dir) return;
2308
0
    if ((realdir = genPathname(dir, NULL)) == NULL)
2309
0
        return;
2310
2311
0
#ifndef _WIN32
2312
0
    {
2313
0
    struct stat s;
2314
0
    l_int32 err = stat(realdir, &s);
2315
0
    if (err != -1 && S_ISDIR(s.st_mode))
2316
0
        *pexists = 1;
2317
0
    }
2318
#else  /* _WIN32 */
2319
    {
2320
    l_uint32  attributes;
2321
    attributes = GetFileAttributesA(realdir);
2322
    if (attributes != INVALID_FILE_ATTRIBUTES &&
2323
        (attributes & FILE_ATTRIBUTE_DIRECTORY))
2324
        *pexists = 1;
2325
    }
2326
#endif  /* _WIN32 */
2327
2328
0
    LEPT_FREE(realdir);
2329
0
}
2330
2331
2332
/*!
2333
 * \brief   lept_rm_match()
2334
 *
2335
 * \param[in]    subdir    [optional] if NULL, the removed files are in /tmp
2336
 * \param[in]    substr    [optional] pattern to match in filename
2337
 * \return  0 on success, non-zero on failure
2338
 *
2339
 * <pre>
2340
 * Notes:
2341
 *      (1) This removes the matched files in /tmp or a subdirectory of /tmp.
2342
 *          Use NULL for %subdir if the files are in /tmp.
2343
 *      (2) If %substr == NULL, this removes all files in the directory.
2344
 *          If %substr == "" (empty), this removes no files.
2345
 *          If both %subdir == NULL and %substr == NULL, this removes
2346
 *          all files in /tmp.
2347
 *      (3) Use unix pathname separators.
2348
 *      (4) By calling genPathname(), if the pathname begins with "/tmp"
2349
 *          this does an automatic directory translation for operating
2350
 *          systems that use a different path for /tmp.
2351
 *      (5) Error conditions:
2352
 *            * returns -1 if the directory is not found
2353
 *            * returns the number of files (> 0) that it was unable to remove.
2354
 * </pre>
2355
 */
2356
l_int32
2357
lept_rm_match(const char  *subdir,
2358
              const char  *substr)
2359
0
{
2360
0
char    *path, *fname;
2361
0
char     tempdir[256];
2362
0
l_int32  i, n, ret;
2363
0
SARRAY  *sa;
2364
2365
0
    makeTempDirname(tempdir, sizeof(tempdir), subdir);
2366
0
    if ((sa = getSortedPathnamesInDirectory(tempdir, substr, 0, 0)) == NULL)
2367
0
        return ERROR_INT("sa not made", __func__, -1);
2368
0
    n = sarrayGetCount(sa);
2369
0
    if (n == 0) {
2370
0
        L_WARNING("no matching files found\n", __func__);
2371
0
        sarrayDestroy(&sa);
2372
0
        return 0;
2373
0
    }
2374
2375
0
    ret = 0;
2376
0
    for (i = 0; i < n; i++) {
2377
0
        fname = sarrayGetString(sa, i, L_NOCOPY);
2378
0
        path = genPathname(fname, NULL);
2379
0
        if (lept_rmfile(path) != 0) {
2380
0
            L_ERROR("failed to remove %s\n", __func__, path);
2381
0
            ret++;
2382
0
        }
2383
0
        LEPT_FREE(path);
2384
0
    }
2385
0
    sarrayDestroy(&sa);
2386
0
    return ret;
2387
0
}
2388
2389
2390
/*!
2391
 * \brief   lept_rm()
2392
 *
2393
 * \param[in]    subdir    [optional] subdir of '/tmp'; can be NULL
2394
 * \param[in]    tail      filename without the directory
2395
 * \return  0 on success, non-zero on failure
2396
 *
2397
 * <pre>
2398
 * Notes:
2399
 *      (1) By calling genPathname(), this does an automatic directory
2400
 *          translation on operating systems which use a different path.
2401
 * </pre>
2402
 */
2403
l_int32
2404
lept_rm(const char  *subdir,
2405
        const char  *tail)
2406
0
{
2407
0
char    *path;
2408
0
char     newtemp[256];
2409
0
l_int32  ret;
2410
2411
0
    if (!tail || strlen(tail) == 0)
2412
0
        return ERROR_INT("tail undefined or empty", __func__, 1);
2413
2414
0
    if (makeTempDirname(newtemp, sizeof(newtemp), subdir))
2415
0
        return ERROR_INT("temp dirname not made", __func__, 1);
2416
0
    path = genPathname(newtemp, tail);
2417
0
    ret = lept_rmfile(path);
2418
0
    LEPT_FREE(path);
2419
0
    return ret;
2420
0
}
2421
2422
2423
/*!
2424
 * \brief
2425
 *
2426
 *  lept_rmfile()
2427
 *
2428
 * \param[in]    filepath     full path to file including the directory
2429
 * \return  0 on success, non-zero on failure
2430
 *
2431
 * <pre>
2432
 * Notes:
2433
 *      (1) This removes the named file.
2434
 *      (2) Use unix pathname separators.
2435
 *      (3) There is no name translation.
2436
 *      (4) Unlike the other lept_* functions in this section, this can remove
2437
 *          any file -- it is not restricted to files that are in /tmp or a
2438
 *          subdirectory of it.
2439
 *      (5) For files in /tmp or a subdirectory of it, this does an automatic
2440
 *          directory translation for operating systems that use a different
2441
 *          path for /tmp.
2442
 * </pre>
2443
 */
2444
l_int32
2445
lept_rmfile(const char  *filepath)
2446
0
{
2447
0
l_int32  ret;
2448
2449
0
    if (!filepath || strlen(filepath) == 0)
2450
0
        return ERROR_INT("filepath undefined or empty", __func__, 1);
2451
2452
0
#ifndef _WIN32
2453
0
    ret = remove(filepath);
2454
#else
2455
        /* Set attributes to allow deletion of read-only files */
2456
    SetFileAttributesA(filepath, FILE_ATTRIBUTE_NORMAL);
2457
    ret = DeleteFileA(filepath) ? 0 : 1;
2458
#endif  /* !_WIN32 */
2459
2460
0
    return ret;
2461
0
}
2462
2463
2464
/*!
2465
 * \brief   lept_mv()
2466
 *
2467
 * \param[in]    srcfile
2468
 * \param[in]    newdir     [optional]; can be NULL
2469
 * \param[in]    newtail    [optional]; can be NULL
2470
 * \param[out]   pnewpath   [optional] of actual path; can be NULL
2471
 * \return  0 on success, non-zero on failure
2472
 *
2473
 * <pre>
2474
 * Notes:
2475
 *      (1) This moves %srcfile to /tmp or to a subdirectory of /tmp.
2476
 *      (2) %srcfile can either be a full path or relative to the
2477
 *          current directory.
2478
 *      (3) %newdir can either specify an existing subdirectory of /tmp
2479
 *          or can be NULL.  In the latter case, the file will be written
2480
 *          into /tmp.
2481
 *      (4) %newtail can either specify a filename tail or, if NULL,
2482
 *          the filename is taken from src-tail, the tail of %srcfile.
2483
 *      (5) For debugging, the computed newpath can be returned.  It must
2484
 *          be freed by the caller.
2485
 *      (6) Reminders:
2486
 *          (a) specify files using unix pathnames
2487
 *          (b) this does an automatic directory translation on operating
2488
 *              systems that use a different path for /tmp.
2489
 *      (7) Examples:
2490
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2491
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2492
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2493
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2494
 * </pre>
2495
 */
2496
l_int32
2497
lept_mv(const char  *srcfile,
2498
        const char  *newdir,
2499
        const char  *newtail,
2500
        char       **pnewpath)
2501
0
{
2502
0
char    *srcpath, *newpath, *dir, *srctail;
2503
0
char     newtemp[256];
2504
0
l_int32  ret;
2505
2506
0
    if (!srcfile)
2507
0
        return ERROR_INT("srcfile not defined", __func__, 1);
2508
2509
        /* Require output pathname to be in /tmp/ or a subdirectory */
2510
0
    if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2511
0
        return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2512
2513
        /* Get canonical src pathname */
2514
0
    splitPathAtDirectory(srcfile, &dir, &srctail);
2515
2516
0
#ifndef _WIN32
2517
0
    srcpath = pathJoin(dir, srctail);
2518
0
    LEPT_FREE(dir);
2519
2520
        /* Generate output pathname */
2521
0
    if (!newtail || newtail[0] == '\0')
2522
0
        newpath = pathJoin(newtemp, srctail);
2523
0
    else
2524
0
        newpath = pathJoin(newtemp, newtail);
2525
0
    LEPT_FREE(srctail);
2526
2527
        /* Overwrite any existing file at 'newpath' */
2528
0
    ret = fileCopy(srcpath, newpath);
2529
0
    if (!ret) {  /* and remove srcfile */
2530
0
        char *realpath = genPathname(srcpath, NULL);
2531
0
        remove(realpath);
2532
0
        LEPT_FREE(realpath);
2533
0
    }
2534
#else
2535
    srcpath = genPathname(dir, srctail);
2536
    LEPT_FREE(dir);
2537
2538
        /* Generate output pathname */
2539
    if (!newtail || newtail[0] == '\0')
2540
        newpath = genPathname(newtemp, srctail);
2541
    else
2542
        newpath = genPathname(newtemp, newtail);
2543
    LEPT_FREE(srctail);
2544
2545
        /* Overwrite any existing file at 'newpath' */
2546
    ret = MoveFileExA(srcpath, newpath,
2547
                     MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) ? 0 : 1;
2548
#endif  /* ! _WIN32 */
2549
2550
0
    LEPT_FREE(srcpath);
2551
0
    if (pnewpath)
2552
0
        *pnewpath = newpath;
2553
0
    else
2554
0
        LEPT_FREE(newpath);
2555
0
    return ret;
2556
0
}
2557
2558
2559
/*!
2560
 * \brief   lept_cp()
2561
 *
2562
 * \param[in]    srcfile
2563
 * \param[in]    newdir    [optional]; can be NULL
2564
 * \param[in]    newtail   [optional]; can be NULL
2565
 * \param[out]   pnewpath  [optional] of actual path; can be NULL
2566
 * \return  0 on success, non-zero on failure
2567
 *
2568
 * <pre>
2569
 * Notes:
2570
 *      (1) This copies %srcfile to /tmp or to a subdirectory of /tmp.
2571
 *      (2) %srcfile can either be a full path or relative to the
2572
 *          current directory.
2573
 *      (3) %newdir can either specify an existing subdirectory of /tmp,
2574
 *          or can be NULL.  In the latter case, the file will be written
2575
 *          into /tmp.
2576
 *      (4) %newtail can either specify a filename tail or, if NULL,
2577
 *          the filename is taken from src-tail, the tail of %srcfile.
2578
 *      (5) For debugging, the computed newpath can be returned.  It must
2579
 *          be freed by the caller.
2580
 *      (6) Reminders:
2581
 *          (a) specify files using unix pathnames
2582
 *          (b) this does an automatic directory translation for operating
2583
 *              systems that use a different path for /tmp
2584
 *      (7) Examples:
2585
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2586
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2587
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2588
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2589
 *
2590
 * </pre>
2591
 */
2592
l_int32
2593
lept_cp(const char  *srcfile,
2594
        const char  *newdir,
2595
        const char  *newtail,
2596
        char       **pnewpath)
2597
0
{
2598
0
char    *srcpath, *newpath, *dir, *srctail;
2599
0
char     newtemp[256];
2600
0
l_int32  ret;
2601
2602
0
    if (!srcfile)
2603
0
        return ERROR_INT("srcfile not defined", __func__, 1);
2604
2605
        /* Require output pathname to be in /tmp or a subdirectory */
2606
0
    if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2607
0
        return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2608
2609
       /* Get canonical src pathname */
2610
0
    splitPathAtDirectory(srcfile, &dir, &srctail);
2611
2612
0
#ifndef _WIN32
2613
0
    srcpath = pathJoin(dir, srctail);
2614
0
    LEPT_FREE(dir);
2615
2616
        /* Generate output pathname */
2617
0
    if (!newtail || newtail[0] == '\0')
2618
0
        newpath = pathJoin(newtemp, srctail);
2619
0
    else
2620
0
        newpath = pathJoin(newtemp, newtail);
2621
0
    LEPT_FREE(srctail);
2622
2623
        /* Overwrite any existing file at 'newpath' */
2624
0
    ret = fileCopy(srcpath, newpath);
2625
#else
2626
    srcpath = genPathname(dir, srctail);
2627
    LEPT_FREE(dir);
2628
2629
        /* Generate output pathname */
2630
    if (!newtail || newtail[0] == '\0')
2631
        newpath = genPathname(newtemp, srctail);
2632
    else
2633
        newpath = genPathname(newtemp, newtail);
2634
    LEPT_FREE(srctail);
2635
2636
        /* Overwrite any existing file at 'newpath' */
2637
    ret = CopyFileA(srcpath, newpath, FALSE) ? 0 : 1;
2638
#endif   /* !_WIN32 */
2639
2640
0
    LEPT_FREE(srcpath);
2641
0
    if (pnewpath)
2642
0
        *pnewpath = newpath;
2643
0
    else
2644
0
        LEPT_FREE(newpath);
2645
0
    return ret;
2646
0
}
2647
2648
2649
/*--------------------------------------------------------------------*
2650
 *          Special debug/test function for calling 'system'          *
2651
 *--------------------------------------------------------------------*/
2652
#if defined(__APPLE__)
2653
  #include "TargetConditionals.h"
2654
#endif  /* __APPLE__ */
2655
2656
/*!
2657
 * \brief   callSystemDebug()
2658
 *
2659
 * \param[in]    cmd      command to be exec'd
2660
 * \return  0 on success
2661
 *
2662
 * <pre>
2663
 * Notes:
2664
 *      (1) The C library 'system' call is only made through this function.
2665
 *          It only works in debug/test mode, where the global variable
2666
 *          LeptDebugOK == TRUE.  This variable is set to FALSE in the
2667
 *          library as distributed, and calling this function will
2668
 *          generate an error message.
2669
 * </pre>
2670
 */
2671
l_int32
2672
callSystemDebug(const char *cmd)
2673
0
{
2674
0
l_int32  ret;
2675
2676
0
    if (!cmd) {
2677
0
        L_ERROR("cmd not defined\n", __func__);
2678
0
        return 1;
2679
0
    }
2680
0
    if (LeptDebugOK == FALSE) {
2681
0
        L_INFO("'system' calls are disabled\n", __func__);
2682
0
        return 1;
2683
0
    }
2684
2685
#if defined(__APPLE__)  /* iOS 11 does not support system() */
2686
2687
  #if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1)  /* Mac OS X */
2688
    ret = system(cmd);
2689
  #elif TARGET_OS_IPHONE || defined(OS_IOS)  /* iOS */
2690
    L_ERROR("iOS 11 does not support system()\n", __func__);
2691
  #endif  /* TARGET_OS_OSX */
2692
2693
#else /* ! __APPLE__ */
2694
2695
0
   ret = system(cmd);
2696
2697
0
#endif /* __APPLE__ */
2698
2699
0
   return ret;
2700
0
}
2701
2702
2703
/*--------------------------------------------------------------------*
2704
 *                     General file name operations                   *
2705
 *--------------------------------------------------------------------*/
2706
/*!
2707
 * \brief   splitPathAtDirectory()
2708
 *
2709
 * \param[in]    pathname  full path; can be a directory
2710
 * \param[out]   pdir      [optional] root directory name of
2711
 *                         input path, including trailing '/'
2712
 * \param[out]   ptail     [optional] path tail, which is either
2713
 *                         the file name within the root directory or
2714
 *                         the last sub-directory in the path
2715
 * \return  0 if OK, 1 on error
2716
 *
2717
 * <pre>
2718
 * Notes:
2719
 *      (1) If you only want the tail, input null for the root directory ptr.
2720
 *      (2) If you only want the root directory name, input null for the
2721
 *          tail ptr.
2722
 *      (3) This function makes decisions based only on the lexical
2723
 *          structure of the input.  Examples:
2724
 *            /usr/tmp/abc.d  -->  dir: /usr/tmp/       tail: abc.d
2725
 *            /usr/tmp/       -->  dir: /usr/tmp/       tail: [empty string]
2726
 *            /usr/tmp        -->  dir: /usr/           tail: tmp
2727
 *            abc.d           -->  dir: [empty string]  tail: abc.d
2728
 *      (4) Consider the first example above: /usr/tmp/abc.d.
2729
 *          Suppose you want the stem of the file, abc, without either
2730
 *          the directory or the extension.  This can be extracted in two steps:
2731
 *              splitPathAtDirectory("/usr/tmp/abc.d", NULL, &tail);
2732
 *                   [sets tail: "abc.d"]
2733
 *              splitPathAtExtension(tail, &basename, NULL);
2734
 *                   [sets basename: "abc"]
2735
 *      (5) The input can have either forward (unix) or backward (win)
2736
 *          slash separators.  The output has unix separators.
2737
 *          Note that Win32 pathname functions generally accept both
2738
 *          slash forms, but the Windows command line interpreter
2739
 *          only accepts backward slashes, because forward slashes are
2740
 *          used to demarcate switches (vs. dashes in unix).
2741
 * </pre>
2742
 */
2743
l_ok
2744
splitPathAtDirectory(const char  *pathname,
2745
                     char       **pdir,
2746
                     char       **ptail)
2747
5.19k
{
2748
5.19k
char  *cpathname, *lastslash;
2749
2750
5.19k
    if (!pdir && !ptail)
2751
0
        return ERROR_INT("null input for both strings", __func__, 1);
2752
5.19k
    if (pdir) *pdir = NULL;
2753
5.19k
    if (ptail) *ptail = NULL;
2754
5.19k
    if (!pathname)
2755
0
        return ERROR_INT("pathname not defined", __func__, 1);
2756
2757
5.19k
    cpathname = stringNew(pathname);
2758
5.19k
    convertSepCharsInPath(cpathname, UNIX_PATH_SEPCHAR);
2759
5.19k
    lastslash = strrchr(cpathname, '/');
2760
5.19k
    if (lastslash) {
2761
5.19k
        if (ptail)
2762
5.19k
            *ptail = stringNew(lastslash + 1);
2763
5.19k
        if (pdir) {
2764
0
            *(lastslash + 1) = '\0';
2765
0
            *pdir = cpathname;
2766
5.19k
        } else {
2767
5.19k
            LEPT_FREE(cpathname);
2768
5.19k
        }
2769
5.19k
    } else {  /* no directory */
2770
0
        if (pdir)
2771
0
            *pdir = stringNew("");
2772
0
        if (ptail)
2773
0
            *ptail = cpathname;
2774
0
        else
2775
0
            LEPT_FREE(cpathname);
2776
0
    }
2777
2778
5.19k
    return 0;
2779
5.19k
}
2780
2781
2782
/*!
2783
 * \brief   splitPathAtExtension()
2784
 *
2785
 * \param[in]    pathname    full path; can be a directory
2786
 * \param[out]   pbasename   [optional] pathname not including the
2787
 *                           last dot and characters after that
2788
 * \param[out]   pextension  [optional] path extension, which is
2789
 *                           the last dot and the characters after it.  If
2790
 *                           there is no extension, it returns the empty string
2791
 * \return  0 if OK, 1 on error
2792
 *
2793
 * <pre>
2794
 * Notes:
2795
 *      (1) If you only want the extension, input null for the basename ptr.
2796
 *      (2) If you only want the basename without extension, input null
2797
 *          for the extension ptr.
2798
 *      (3) This function makes decisions based only on the lexical
2799
 *          structure of the input.  Examples:
2800
 *            /usr/tmp/abc.jpg  -->  basename: /usr/tmp/abc    ext: .jpg
2801
 *            /usr/tmp/.jpg     -->  basename: /usr/tmp/       ext: .jpg
2802
 *            /usr/tmp.jpg/     -->  basename: /usr/tmp.jpg/   ext: [empty str]
2803
 *            ./.jpg            -->  basename: ./              ext: .jpg
2804
 *      (4) The input can have either forward (unix) or backward (win)
2805
 *          slash separators.  The output has unix separators.
2806
 *      (5) Note that basename, as used here, is different from the result
2807
 *          of the unix program 'basename'.  Here, basename is the entire
2808
 *          pathname up to a final extension and its preceding dot.
2809
 * </pre>
2810
 */
2811
l_ok
2812
splitPathAtExtension(const char  *pathname,
2813
                     char       **pbasename,
2814
                     char       **pextension)
2815
0
{
2816
0
char  *tail, *dir, *lastdot;
2817
0
char   empty[4] = "";
2818
2819
0
    if (!pbasename && !pextension)
2820
0
        return ERROR_INT("null input for both strings", __func__, 1);
2821
0
    if (pbasename) *pbasename = NULL;
2822
0
    if (pextension) *pextension = NULL;
2823
0
    if (!pathname)
2824
0
        return ERROR_INT("pathname not defined", __func__, 1);
2825
2826
        /* Split out the directory first */
2827
0
    splitPathAtDirectory(pathname, &dir, &tail);
2828
2829
        /* Then look for a "." in the tail part.
2830
         * This way we ignore all "." in the directory. */
2831
0
    if ((lastdot = strrchr(tail, '.'))) {
2832
0
        if (pextension)
2833
0
            *pextension = stringNew(lastdot);
2834
0
        if (pbasename) {
2835
0
            *lastdot = '\0';
2836
0
            *pbasename = stringJoin(dir, tail);
2837
0
        }
2838
0
    } else {
2839
0
        if (pextension)
2840
0
            *pextension = stringNew(empty);
2841
0
        if (pbasename)
2842
0
            *pbasename = stringNew(pathname);
2843
0
    }
2844
0
    LEPT_FREE(dir);
2845
0
    LEPT_FREE(tail);
2846
0
    return 0;
2847
0
}
2848
2849
2850
/*!
2851
 * \brief   pathJoin()
2852
 *
2853
 * \param[in]    dir     [optional] can be null
2854
 * \param[in]    fname   [optional] can be null
2855
 * \return  specially concatenated path, or NULL on error
2856
 *
2857
 * <pre>
2858
 * Notes:
2859
 *      (1) Use unix-style pathname separators ('/').
2860
 *      (2) %fname can be the entire path, or part of the path containing
2861
 *          at least one directory, or a tail without a directory, or NULL.
2862
 *      (3) It produces a path that strips multiple slashes to a single
2863
 *          slash, joins %dir and %fname by a slash, and has no trailing
2864
 *          slashes (except in the cases where %dir == "/" and
2865
 *          %fname == NULL, or v.v.).
2866
 *      (4) If both %dir and %fname are null, produces an empty string.
2867
 *      (5) Neither %dir nor %fname can begin with '..'.
2868
 *      (6) The result is not canonicalized or tested for correctness:
2869
 *          garbage in (e.g., /&%), garbage out.
2870
 *      (7) Examples:
2871
 *             //tmp// + //abc/  -->  /tmp/abc
2872
 *             tmp/ + /abc/      -->  tmp/abc
2873
 *             tmp/ + abc/       -->  tmp/abc
2874
 *             /tmp/ + ///       -->  /tmp
2875
 *             /tmp/ + NULL      -->  /tmp
2876
 *             // + /abc//       -->  /abc
2877
 *             // + NULL         -->  /
2878
 *             NULL + /abc/def/  -->  /abc/def
2879
 *             NULL + abc//      -->  abc
2880
 *             NULL + //         -->  /
2881
 *             NULL + NULL       -->  (empty string)
2882
 *             "" + ""           -->  (empty string)
2883
 *             "" + /            -->  /
2884
 *             ".." + /etc/foo   -->  NULL
2885
 *             /tmp + ".."       -->  NULL
2886
 * </pre>
2887
 */
2888
char *
2889
pathJoin(const char  *dir,
2890
         const char  *fname)
2891
0
{
2892
0
const char *slash = "/";
2893
0
char       *str, *dest;
2894
0
l_int32     i, n1, n2, emptydir;
2895
0
size_t      size;
2896
0
SARRAY     *sa1, *sa2;
2897
0
L_BYTEA    *ba;
2898
2899
0
    if (!dir && !fname)
2900
0
        return stringNew("");
2901
0
    if (dir && strlen(dir) >= 2 && dir[0] == '.' && dir[1] == '.')
2902
0
        return (char *)ERROR_PTR("dir starts with '..'", __func__, NULL);
2903
0
    if (fname && strlen(fname) >= 2 && fname[0] == '.' && fname[1] == '.')
2904
0
        return (char *)ERROR_PTR("fname starts with '..'", __func__, NULL);
2905
2906
0
    sa1 = sarrayCreate(0);
2907
0
    sa2 = sarrayCreate(0);
2908
0
    ba = l_byteaCreate(4);
2909
2910
        /* Process %dir */
2911
0
    if (dir && strlen(dir) > 0) {
2912
0
        if (dir[0] == '/')
2913
0
            l_byteaAppendString(ba, slash);
2914
0
        sarraySplitString(sa1, dir, "/");  /* removes all slashes */
2915
0
        n1 = sarrayGetCount(sa1);
2916
0
        for (i = 0; i < n1; i++) {
2917
0
            str = sarrayGetString(sa1, i, L_NOCOPY);
2918
0
            l_byteaAppendString(ba, str);
2919
0
            l_byteaAppendString(ba, slash);
2920
0
        }
2921
0
    }
2922
2923
        /* Special case to add leading slash: dir NULL or empty string  */
2924
0
    emptydir = dir && strlen(dir) == 0;
2925
0
    if ((!dir || emptydir) && fname && strlen(fname) > 0 && fname[0] == '/')
2926
0
        l_byteaAppendString(ba, slash);
2927
2928
        /* Process %fname */
2929
0
    if (fname && strlen(fname) > 0) {
2930
0
        sarraySplitString(sa2, fname, "/");
2931
0
        n2 = sarrayGetCount(sa2);
2932
0
        for (i = 0; i < n2; i++) {
2933
0
            str = sarrayGetString(sa2, i, L_NOCOPY);
2934
0
            l_byteaAppendString(ba, str);
2935
0
            l_byteaAppendString(ba, slash);
2936
0
        }
2937
0
    }
2938
2939
        /* Remove trailing slash */
2940
0
    dest = (char *)l_byteaCopyData(ba, &size);
2941
0
    if (size > 1 && dest[size - 1] == '/')
2942
0
        dest[size - 1] = '\0';
2943
2944
0
    sarrayDestroy(&sa1);
2945
0
    sarrayDestroy(&sa2);
2946
0
    l_byteaDestroy(&ba);
2947
0
    return dest;
2948
0
}
2949
2950
2951
/*!
2952
 * \brief   appendSubdirs()
2953
 *
2954
 * \param[in]    basedir
2955
 * \param[in]    subdirs
2956
 * \return  concatenated full directory path without trailing slash,
2957
 *              or NULL on error
2958
 *
2959
 * <pre>
2960
 * Notes:
2961
 *      (1) Use unix pathname separators
2962
 *      (2) Allocates a new string:  [basedir]/[subdirs]
2963
 * </pre>
2964
 */
2965
char *
appendSubdirs(const char  *basedir,
              const char  *subdirs)
{
char   *newdir;
size_t  len1, len2, len3, len4;

    if (!basedir || !subdirs)
        return (char *)ERROR_PTR("basedir and subdirs not both defined",
                                 __func__, NULL);

    len1 = strlen(basedir);
    len2 = strlen(subdirs);
    len3 = len1 + len2 + 8;  /* room for the separator and terminator */
    if ((newdir = (char *)LEPT_CALLOC(len3, 1)) == NULL)
        return (char *)ERROR_PTR("newdir not made", __func__, NULL);
    stringCat(newdir, len3, basedir);

        /* Add '/' if necessary.  The explicit test for an empty
         * %basedir avoids reading the byte before the buffer. */
    if (len1 == 0 || newdir[len1 - 1] != '/')
        newdir[len1] = '/';
    if (subdirs[0] == '/')  /* add subdirs, stripping leading '/' */
        stringCat(newdir, len3, subdirs + 1);
    else
        stringCat(newdir, len3, subdirs);

        /* Strip a trailing '/', guarding against an empty result */
    len4 = strlen(newdir);
    if (len4 > 0 && newdir[len4 - 1] == '/')
        newdir[len4 - 1] = '\0';

    return newdir;
}
2994
2995
2996
/*--------------------------------------------------------------------*
2997
 *                     Special file name operations                   *
2998
 *--------------------------------------------------------------------*/
2999
/*!
3000
 * \brief   convertSepCharsInPath()
3001
 *
3002
 * \param[in]    path
3003
 * \param[in]    type    UNIX_PATH_SEPCHAR, WIN_PATH_SEPCHAR
3004
 * \return  0 if OK, 1 on error
3005
 *
3006
 * <pre>
3007
 * Notes:
3008
 *      (1) In-place conversion.
3009
 *      (2) Type is the resulting type:
3010
 *            * UNIX_PATH_SEPCHAR:  '\\' ==> '/'
3011
 *            * WIN_PATH_SEPCHAR:   '/' ==> '\\'
3012
 *      (3) Virtually all path operations in leptonica use unix separators.
3013
 *      (4) The backslash is a valid character in unix pathnames and should
3014
 *          not be converted.  Each backslash needs to be escaped with a
3015
 *          preceding backslash for the shell, but the actual filename
3016
 *          does not include these escape characters.
3017
 * </pre>
3018
 */
3019
l_ok
3020
convertSepCharsInPath(char    *path,
3021
                      l_int32  type)
3022
10.3k
{
3023
10.3k
l_int32  i;
3024
10.3k
size_t   len;
3025
3026
10.3k
    if (!path)
3027
0
        return ERROR_INT("path not defined", __func__, 1);
3028
10.3k
    if (type != UNIX_PATH_SEPCHAR && type != WIN_PATH_SEPCHAR)
3029
0
        return ERROR_INT("invalid type", __func__, 1);
3030
3031
10.3k
    len = strlen(path);
3032
10.3k
    if (type == UNIX_PATH_SEPCHAR) {
3033
#ifdef _WIN32  /* only convert on Windows */
3034
        for (i = 0; i < len; i++) {
3035
            if (path[i] == '\\')
3036
                path[i] = '/';
3037
        }
3038
#endif  /* _WIN32 */
3039
10.3k
    } else {  /* WIN_PATH_SEPCHAR */
3040
0
        for (i = 0; i < len; i++) {
3041
0
            if (path[i] == '/')
3042
0
                path[i] = '\\';
3043
0
        }
3044
0
    }
3045
10.3k
    return 0;
3046
10.3k
}
3047
3048
3049
/*!
3050
 * \brief   genPathname()
3051
 *
3052
 * \param[in]    dir     [optional] directory or full path name,
3053
 *                       with or without the trailing '/'
3054
 * \param[in]    fname   [optional] file name within a directory
3055
 * \return  pathname either a directory or full path, or NULL on error
3056
 *
3057
 * <pre>
3058
 * Notes:
3059
 *      (1) This function generates actual paths in the following ways:
3060
 *            * from two sub-parts (e.g., a directory and a file name).
3061
 *            * from a single full path, placed in %dir, with
3062
 *              %fname == NULL.
3063
 *            * from the name of a file in the local directory placed in
3064
 *              %fname, with %dir == NULL.
3065
 *            * if in a "/tmp" directory and on iOS, macOS or Windows,
3066
 *              the OS specific temp directory is used.
3067
 *      (2) This does an automatic directory translation for operating
3068
 *          systems that use a different path for /tmp.
3069
 *          That path is determined
3070
 *             * on Windows: by GetTempPath()
3071
 *             * on macOS, iOS: by confstr() (see man page)
3072
 *      (3) On unix, the TMPDIR variable is ignored.  No rewriting
3073
 *          of temp directories is permitted.
3074
 *      (4) There are four cases for the input:
3075
 *          (a) %dir is a directory and %fname is defined: result is a
3076
 *              full path
3077
 *          (b) %dir is a directory and %fname is null: result is a directory
3078
 *          (c) %dir is a full path and %fname is null: result is a full path
3079
 *          (d) %dir is null or an empty string: start in the current dir;
3080
 *              result is a full path
3081
 *      (5) In all cases, the resulting pathname is not terminated with a slash
3082
 *      (6) The caller is responsible for freeing the returned pathname.
3083
 * </pre>
3084
 */
3085
char *
3086
genPathname(const char  *dir,
3087
            const char  *fname)
3088
5.19k
{
3089
#if defined(REWRITE_TMP)
3090
l_int32  rewrite_tmp = TRUE;
3091
#else
3092
5.19k
l_int32  rewrite_tmp = FALSE;
3093
5.19k
#endif  /* REWRITE_TMP */
3094
5.19k
char    *cdir, *pathout;
3095
5.19k
l_int32  dirlen, namelen;
3096
5.19k
size_t   size;
3097
3098
5.19k
    if (!dir && !fname)
3099
0
        return (char *)ERROR_PTR("no input", __func__, NULL);
3100
3101
        /* Handle the case where we start from the current directory */
3102
5.19k
    if (!dir || dir[0] == '\0') {
3103
0
        if ((cdir = getcwd(NULL, 0)) == NULL)
3104
0
            return (char *)ERROR_PTR("no current dir found", __func__, NULL);
3105
5.19k
    } else {
3106
5.19k
        if ((cdir = stringNew(dir)) == NULL)
3107
0
            return (char *)ERROR_PTR("stringNew failed", __func__, NULL);
3108
5.19k
    }
3109
3110
        /* Convert to unix path separators, and remove the trailing
3111
         * slash in the directory, except when dir == "/"  */
3112
5.19k
    convertSepCharsInPath(cdir, UNIX_PATH_SEPCHAR);
3113
5.19k
    dirlen = strlen(cdir);
3114
5.19k
    if (cdir[dirlen - 1] == '/' && dirlen != 1) {
3115
0
        cdir[dirlen - 1] = '\0';
3116
0
        dirlen--;
3117
0
    }
3118
3119
5.19k
    namelen = (fname) ? strlen(fname) : 0;
3120
5.19k
    size = dirlen + namelen + 256;
3121
5.19k
    if ((pathout = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL) {
3122
0
        LEPT_FREE(cdir);
3123
0
        return (char *)ERROR_PTR("pathout not made", __func__, NULL);
3124
0
    }
3125
3126
        /* First handle %dir (which may be a full pathname).
3127
         * There is no path rewriting on unix, and on win32, we do not
3128
         * rewrite unless the specified directory is /tmp or
3129
         * a subdirectory of /tmp */
3130
5.19k
    if (!rewrite_tmp || dirlen < 4 ||
3131
5.19k
        (dirlen == 4 && strncmp(cdir, "/tmp", 4) != 0) ||  /* not in "/tmp" */
3132
5.19k
        (dirlen > 4 && strncmp(cdir, "/tmp/", 5) != 0)) {  /* not in "/tmp/" */
3133
5.19k
        stringCopy(pathout, cdir, dirlen);
3134
5.19k
    } else {  /* Rewrite with "/tmp" specified for the directory. */
3135
#if defined(__APPLE__)
3136
        size_t n = confstr(_CS_DARWIN_USER_TEMP_DIR, pathout, size);
3137
        if (n == 0 || n > size) {
3138
            /* Fall back to using /tmp */
3139
            stringCopy(pathout, cdir, dirlen);
3140
        } else {
3141
            /* Add the rest of cdir */
3142
            if (dirlen > 4)
3143
                stringCat(pathout, size, cdir + 4);
3144
        }
3145
#elif defined(_WIN32)
3146
        l_int32 tmpdirlen;
3147
        char tmpdir[MAX_PATH];
3148
        GetTempPathA(sizeof(tmpdir), tmpdir);  /* get the Windows temp dir */
3149
        tmpdirlen = strlen(tmpdir);
3150
        if (tmpdirlen > 0 && tmpdir[tmpdirlen - 1] == '\\') {
3151
            tmpdir[tmpdirlen - 1] = '\0';  /* trim the trailing '\' */
3152
        }
3153
        tmpdirlen = strlen(tmpdir);
3154
        stringCopy(pathout, tmpdir, tmpdirlen);
3155
3156
            /* Add the rest of cdir */
3157
        if (dirlen > 4)
3158
            stringCat(pathout, size, cdir + 4);
3159
#endif  /* _WIN32 */
3160
0
    }
3161
3162
        /* Now handle %fname */
3163
5.19k
    if (fname && strlen(fname) > 0) {
3164
0
        dirlen = strlen(pathout);
3165
0
        pathout[dirlen] = '/';
3166
0
        stringCat(pathout, size, fname);
3167
0
    }
3168
3169
5.19k
    LEPT_FREE(cdir);
3170
5.19k
    return pathout;
3171
5.19k
}
3172
3173
3174
/*!
3175
 * \brief   makeTempDirname()
3176
 *
3177
 * \param[in]    result    preallocated on stack or heap and passed in
3178
 * \param[in]    nbytes    size of %result array, in bytes
3179
 * \param[in]    subdir    [optional]; can be NULL or an empty string
3180
 * \return  0 if OK, 1 on error
3181
 *
3182
 * <pre>
3183
 * Notes:
3184
 *      (1) This generates the directory path for output temp files,
3185
 *          written into %result with unix separators.
3186
 *      (2) Caller allocates %result, large enough to hold the path,
3187
 *          which is:
3188
 *            /tmp/%subdir       (unix)
3189
 *            [Temp]/%subdir     (Windows, macOS, iOS)
3190
 *          where [Temp] is the OS path
3191
 *          and %subdir is in general a set of nested subdirectories:
3192
 *            dir1/dir2/.../dirN
3193
 *          which in use would not typically exceed 2 levels.
3194
 *      (3) Usage example:
3195
 * \code
3196
 *           char  result[256];
3197
 *           makeTempDirname(result, sizeof(result), "lept/golden");
3198
 * \endcode
3199
 * </pre>
3200
 */
3201
l_ok
3202
makeTempDirname(char        *result,
3203
                size_t       nbytes,
3204
                const char  *subdir)
3205
0
{
3206
0
char    *dir, *path;
3207
0
l_int32  ret = 0;
3208
0
size_t   pathlen;
3209
3210
0
    if (!result)
3211
0
        return ERROR_INT("result not defined", __func__, 1);
3212
0
    if (subdir && ((subdir[0] == '.') || (subdir[0] == '/')))
3213
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
3214
3215
0
    memset(result, 0, nbytes);
3216
3217
0
    dir = pathJoin("/tmp", subdir);
3218
3219
#if defined(REWRITE_TMP)
3220
    path = genPathname(dir, NULL);
3221
#else
3222
0
    path = stringNew(dir);
3223
0
#endif  /*  ~ _WIN32 */
3224
0
    pathlen = strlen(path);
3225
0
    if (pathlen < nbytes - 1) {
3226
0
        stringCopy(result, path, nbytes);
3227
0
    } else {
3228
0
        L_ERROR("result array too small for path\n", __func__);
3229
0
        ret = 1;
3230
0
    }
3231
3232
0
    LEPT_FREE(dir);
3233
0
    LEPT_FREE(path);
3234
0
    return ret;
3235
0
}
3236
3237
3238
/*!
3239
 * \brief   modifyTrailingSlash()
3240
 *
3241
 * \param[in]    path     preallocated on stack or heap and passed in
3242
 * \param[in]    nbytes   size of %path array, in bytes
3243
 * \param[in]    flag     L_ADD_TRAIL_SLASH or L_REMOVE_TRAIL_SLASH
3244
 * \return  0 if OK, 1 on error
3245
 *
3246
 * <pre>
3247
 * Notes:
3248
 *      (1) This carries out the requested action if necessary.
3249
 * </pre>
3250
 */
3251
l_ok
3252
modifyTrailingSlash(char    *path,
3253
                    size_t   nbytes,
3254
                    l_int32  flag)
3255
0
{
3256
0
char    lastchar;
3257
0
size_t  len;
3258
3259
0
    if (!path)
3260
0
        return ERROR_INT("path not defined", __func__, 1);
3261
0
    if (flag != L_ADD_TRAIL_SLASH && flag != L_REMOVE_TRAIL_SLASH)
3262
0
        return ERROR_INT("invalid flag", __func__, 1);
3263
3264
0
    len = strlen(path);
3265
0
    lastchar = path[len - 1];
3266
0
    if (flag == L_ADD_TRAIL_SLASH && lastchar != '/' && len < nbytes - 2) {
3267
0
        path[len] = '/';
3268
0
        path[len + 1] = '\0';
3269
0
    } else if (flag == L_REMOVE_TRAIL_SLASH && lastchar == '/') {
3270
0
        path[len - 1] = '\0';
3271
0
    }
3272
0
    return 0;
3273
0
}
3274
3275
3276
/*!
3277
 * \brief   l_makeTempFilename()
3278
 *
3279
 * \return  fname : heap allocated filename; returns NULL on failure.
3280
 *
3281
 * <pre>
3282
 * Notes:
3283
 *      (1) On unix, this makes a filename of the form
3284
 *               "/tmp/lept.XXXXXX",
3285
 *          where each X is a random character.
3286
 *      (2) On Windows, this makes a filename of the form
3287
 *               "/[Temp]/lp.XXXXXX".
3288
 *      (3) On all systems, this fails if the file is not writable.
3289
 *      (4) Safest usage is to write to a subdirectory in debug code.
3290
 *      (5) The returned filename must be freed by the caller, using lept_free.
3291
 *      (6) The tail of the filename has a '.', so that cygwin interprets
3292
 *          the file as having an extension.  Otherwise, cygwin assumes it
3293
 *          is an executable and appends ".exe" to the filename.
3294
 *      (7) On unix, whenever possible use tmpfile() instead.  tmpfile()
3295
 *          hides the file name, returns a stream opened for write,
3296
 *          and deletes the temp file when the stream is closed.
3297
 * </pre>
3298
 */
3299
char *
3300
l_makeTempFilename(void)
3301
0
{
3302
0
char  dirname[240];
3303
3304
0
    if (makeTempDirname(dirname, sizeof(dirname), NULL) == 1)
3305
0
        return (char *)ERROR_PTR("failed to make dirname", __func__, NULL);
3306
3307
0
#ifndef _WIN32
3308
0
{
3309
0
    char    *pattern;
3310
0
    l_int32  fd;
3311
0
    pattern = stringConcatNew(dirname, "/lept.XXXXXX", NULL);
3312
0
    fd = mkstemp(pattern);
3313
0
    if (fd == -1) {
3314
0
        LEPT_FREE(pattern);
3315
0
        return (char *)ERROR_PTR("mkstemp failed", __func__, NULL);
3316
0
    }
3317
0
    close(fd);
3318
0
    return pattern;
3319
0
}
3320
#else
3321
{
3322
    char  fname[MAX_PATH];
3323
    FILE *fp;
3324
    if (GetTempFileNameA(dirname, "lp.", 0, fname) == 0)
3325
        return (char *)ERROR_PTR("GetTempFileName failed", __func__, NULL);
3326
    if ((fp = fopen(fname, "wb")) == NULL)
3327
        return (char *)ERROR_PTR("file cannot be written to", __func__, NULL);
3328
    fclose(fp);
3329
    return stringNew(fname);
3330
}
3331
#endif  /*  ~ _WIN32 */
3332
0
}
3333
3334
3335
/*!
3336
 * \brief   extractNumberFromFilename()
3337
 *
3338
 * \param[in]    fname
3339
 * \param[in]    numpre    number of characters before the digits to be found
3340
 * \param[in]    numpost   number of characters after the digits to be found
3341
 * \return  num number embedded in the filename; -1 on error or if
3342
 *                   not found
3343
 *
3344
 * <pre>
3345
 * Notes:
3346
 *      (1) The number is to be found in the basename, which is the
3347
 *          filename without either the directory or the last extension.
3348
 *      (2) When a number is found, it is non-negative.  If no number
3349
 *          is found, this returns -1, without an error message.  The
3350
 *          caller needs to check.
3351
 * </pre>
3352
 */
3353
l_int32
3354
extractNumberFromFilename(const char  *fname,
3355
                          l_int32      numpre,
3356
                          l_int32      numpost)
3357
0
{
3358
0
char    *tail, *basename;
3359
0
l_int32  len, nret, num;
3360
3361
0
    if (!fname)
3362
0
        return ERROR_INT("fname not defined", __func__, -1);
3363
3364
0
    splitPathAtDirectory(fname, NULL, &tail);
3365
0
    splitPathAtExtension(tail, &basename, NULL);
3366
0
    LEPT_FREE(tail);
3367
3368
0
    len = strlen(basename);
3369
0
    if (numpre + numpost > len - 1) {
3370
0
        LEPT_FREE(basename);
3371
0
        return ERROR_INT("numpre + numpost too big", __func__, -1);
3372
0
    }
3373
3374
0
    basename[len - numpost] = '\0';
3375
0
    nret = sscanf(basename + numpre, "%d", &num);
3376
0
    LEPT_FREE(basename);
3377
3378
0
    if (nret == 1)
3379
0
        return num;
3380
0
    else
3381
0
        return -1;  /* not found */
3382
0
}