Coverage Report

Created: 2026-04-01 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/leptonica/src/utils2.c
Line
Count
Source
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file utils2.c
29
 * <pre>
30
 *
31
 *      ------------------------------------------
32
 *      This file has these utilities:
33
 *         - safe string operations
34
 *         - find/replace operations on strings
35
 *         - read/write between file and memory
36
 *         - multi-platform file and directory operations
37
 *         - file name operations
38
 *      ------------------------------------------
39
 *
40
 *       Safe string procs
41
 *           char      *stringNew()
42
 *           l_int32    stringCopy()
43
 *           l_int32    stringCopySegment()
44
 *           l_int32    stringReplace()
45
 *           l_int32    stringLength()
46
 *           l_int32    stringCat()
47
 *           char      *stringConcatNew()
48
 *           char      *stringJoin()
49
 *           l_int32    stringJoinIP()
50
 *           char      *stringReverse()
51
 *           char      *strtokSafe()
52
 *           l_int32    stringSplitOnToken()
53
 *
54
 *       Find and replace string and array procs
55
 *           l_int32    stringCheckForChars()
56
 *           char      *stringRemoveChars()
57
 *           char      *stringReplaceEachSubstr()
58
 *           char      *stringReplaceSubstr()
59
 *           L_DNA     *stringFindEachSubstr()
60
 *           l_int32    stringFindSubstr()
61
 *           l_uint8   *arrayReplaceEachSequence()
62
 *           L_DNA     *arrayFindEachSequence()
63
 *           l_int32    arrayFindSequence()
64
 *
65
 *       Safe realloc
66
 *           void      *reallocNew()
67
 *
68
 *       Read and write between file and memory
69
 *           l_uint8   *l_binaryRead()
70
 *           l_uint8   *l_binaryReadStream()
71
 *           l_uint8   *l_binaryReadSelect()
72
 *           l_uint8   *l_binaryReadSelectStream()
73
 *           l_int32    l_binaryWrite()
74
 *           l_int32    nbytesInFile()
75
 *           l_int32    fnbytesInFile()
76
 *
77
 *       Copy and compare in memory
78
 *           l_uint8   *l_binaryCopy()
79
 *           l_uint8   *l_binaryCompare()
80
 *
81
 *       File copy operations
82
 *           l_int32    fileCopy()
83
 *           l_int32    fileConcatenate()
84
 *           l_int32    fileAppendString()
85
 *
86
 *       File split operations
87
 *           l_int32    fileSplitLinesUniform()
88
 *
89
 *       Multi-platform functions for opening file streams
90
 *           FILE      *fopenReadStream()
91
 *           FILE      *fopenWriteStream()
92
 *           FILE      *fopenReadFromMemory()
93
 *
94
 *       Opening a Windows tmpfile for writing
95
 *           FILE      *fopenWriteWinTempfile()
96
 *
97
 *       Multi-platform functions that avoid C-runtime boundary crossing
98
 *       with Windows DLLs  (use in programs only)
99
 *           FILE      *lept_fopen()
100
 *           l_int32    lept_fclose()
101
 *           void      *lept_calloc()
102
 *           void       lept_free()
103
 *
104
 *       Multi-platform file system operations in temp directories
105
 *           l_int32    lept_mkdir()
106
 *           l_int32    lept_rmdir()
107
 *           l_int32    lept_direxists()
108
 *           l_int32    lept_mv()
109
 *           l_int32    lept_rm_match()
110
 *           l_int32    lept_rm()
111
 *           l_int32    lept_rmfile()
112
 *           l_int32    lept_cp()
113
 *
114
 *       Special debug/test function for calling 'system'
115
 *           l_int32    callSystemDebug()
116
 *
117
 *       General file name operations
118
 *           l_int32    splitPathAtDirectory()
119
 *           l_int32    splitPathAtExtension()
120
 *           char      *pathJoin()
121
 *           char      *appendSubdirs()
122
 *
123
 *       Special file name operations
124
 *           l_int32    convertSepCharsInPath()
125
 *           char      *genPathname()
126
 *           l_int32    makeTempDirname()
127
 *           l_int32    modifyTrailingSlash()
128
 *           char      *l_makeTempFilename()
129
 *           l_int32    extractNumberFromFilename()
130
 *
131
 *
132
 *  Notes on multi-platform development
133
 *  -----------------------------------
134
 *  This is important:
135
 *  (1) With the exception of splitPathAtDirectory(), splitPathAtExtension()
136
 *      and genPathname(), all input pathnames must have unix separators.
137
 *  (2) On macOS, iOS and Windows, for read or write to "/tmp/..."
138
 *      the filename is rewritten to use the OS specific temp directory:
139
 *         /tmp  ==>   [Temp]/...
140
 *  (3) This filename rewrite, along with the conversion from unix
141
 *      to OS specific pathnames, happens in genPathname().
142
 *  (4) Use fopenReadStream() and fopenWriteStream() to open files,
143
 *      because these use genPathname() to find the platform-dependent
144
 *      filenames.  Likewise for l_binaryRead() and l_binaryWrite().
145
 *  (5) For moving, copying and removing files and directories that are in
146
 *      subdirectories of /tmp, use the lept_*() file system shell wrappers:
147
 *         lept_mkdir(), lept_rmdir(), lept_mv(), lept_rm() and lept_cp().
148
 *  (6) For programs use the lept_fopen(), lept_fclose(), lept_calloc()
149
 *      and lept_free() C library wrappers.  These work properly on Windows,
150
 *      where the same DLL must perform complementary operations on
151
 *      file streams (open/close) and heap memory (malloc/free).
152
 *  (7) Why read and write files to temp directories?
153
 *      The library needs the ability to read and write ephemeral
154
 *      files to default places, both for generating debugging output
155
 *      and for supporting regression tests.  Applications also need
156
 *      this ability for debugging.
157
 *  (8) Why do the pathname rewrite on macOS, iOS and Windows?
158
 *      The goal is to have the library, and programs using the library,
159
 *      run on multiple platforms without changes.  The location of
160
 *      temporary files depends on the platform as well as the user's
161
 *      configuration.  Temp files on some operating systems are in some
162
 *      directory not known a priori.  To make everything work seamlessly on
163
 *      any OS, every time you open a file for reading or writing,
164
 *      use a special function such as fopenReadStream() or
165
 *      fopenWriteStream(); these call genPathname() to ensure that
166
 *      if it is a temp file, the correct path is used.  To indicate
167
 *      that this is a temp file, the application is written with the
168
 *      root directory of the path in a canonical form: "/tmp".
169
 *  (9) Why is it that multi-platform directory functions like lept_mkdir()
170
 *      and lept_rmdir(), as well as associated file functions like
171
 *      lept_rm(), lept_mv() and lept_cp(), only work in the temp dir?
172
 *      These functions were designed to provide easy manipulation of
173
 *      temp files.  The restriction to temp files is for safety -- to
174
 *      prevent an accidental deletion of important files.  For example,
175
 *      lept_rmdir() first deletes all files in a specified subdirectory
176
 *      of temp, and then removes the directory.
177
 *
178
 * </pre>
179
 */
180
181
#ifdef HAVE_CONFIG_H
182
#include <config_auto.h>
183
#endif  /* HAVE_CONFIG_H */
184
185
#ifdef _MSC_VER
186
#include <process.h>
187
#include <direct.h>
188
#define getcwd _getcwd  /* fix MSVC warning */
189
#else
190
#include <unistd.h>
191
#endif   /* _MSC_VER */
192
193
#ifdef _WIN32
194
#include <windows.h>
195
#include <fcntl.h>     /* _O_CREAT, ... */
196
#include <io.h>        /* _open */
197
#include <sys/stat.h>  /* _S_IREAD, _S_IWRITE */
198
#else
199
#include <sys/stat.h>  /* for stat, mkdir(2) */
200
#include <sys/types.h>
201
#endif
202
203
#ifdef __APPLE__
204
#include <unistd.h>
205
#include <errno.h>
206
#endif
207
208
#include <string.h>
209
#include <stddef.h>
210
#include "allheaders.h"
211
212
#if defined(__APPLE__) || defined(_WIN32)
213
/* Rewrite paths starting with /tmp for macOS, iOS and Windows. */
214
#define REWRITE_TMP
215
#endif
216
217
/*--------------------------------------------------------------------*
218
 *                       Safe string operations                       *
219
 *--------------------------------------------------------------------*/
220
/*!
 * \brief   stringNew()
 *
 * \param[in]    src    NUL-terminated string to be duplicated
 * \return  dest newly allocated copy of %src, or NULL if %src is null
 *               or if the allocation fails
 *
 * <pre>
 * Notes:
 *      (1) A null %src only generates a warning before returning NULL.
 *      (2) The length is held in a size_t and the bytes are moved with
 *          memcpy(), so the length is never narrowed to a 32-bit
 *          signed integer before the allocation and the copy.
 *      (3) The returned string is newly allocated; the caller is
 *          responsible for freeing it.
 * </pre>
 */
char *
stringNew(const char  *src)
{
size_t  len;
char   *dest;

    if (!src) {
        L_WARNING("src not defined\n", __func__);
        return NULL;
    }

    len = strlen(src);
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("dest not made", __func__, NULL);

        /* Only the %len payload bytes are copied; as before, the
         * terminating NUL is the final byte of the buffer returned
         * by LEPT_CALLOC, which is relied on to be zeroed. */
    memcpy(dest, src, len);
    return dest;
}
244
245
246
/*!
247
 * \brief   stringCopy()
248
 *
249
 * \param[in]    dest    existing byte buffer
250
 * \param[in]    src     string [optional] can be null
251
 * \param[in]    n       max number of characters to copy
252
 * \return  0 if OK, 1 on error
253
 *
254
 * <pre>
255
 * Notes:
256
 *      (1) Relatively safe wrapper for strncpy, that checks the input,
257
 *          and does not complain if %src is null or %n < 1.
258
 *          If %n < 1, this is a no-op.
259
 *      (2) %dest needs to be at least %n bytes in size.
260
 *      (3) We don't call strncpy() because valgrind complains about
261
 *          use of uninitialized values.
262
 * </pre>
263
 */
264
l_ok
265
stringCopy(char        *dest,
266
           const char  *src,
267
           l_int32      n)
268
0
{
269
0
l_int32  i;
270
271
0
    if (!dest)
272
0
        return ERROR_INT("dest not defined", __func__, 1);
273
0
    if (!src || n < 1)
274
0
        return 0;
275
276
        /* Implementation of strncpy that valgrind doesn't complain about */
277
0
    for (i = 0; i < n && src[i] != '\0'; i++)
278
0
        dest[i] = src[i];
279
0
    for (; i < n; i++)
280
0
        dest[i] = '\0';
281
0
    return 0;
282
0
}
283
284
285
/*!
286
 * \brief   stringCopySegment()
287
 *
288
 *
289
 * \param[in]    src      string
290
 * \param[in]    start    byte position at start of segment
291
 * \param[in]    nbytes   number of bytes in the segment; use 0 to go to end
292
 * \return  copy of segment, or NULL on error
293
 *
294
 * <pre>
295
 * Notes:
296
 *      (1) This is a variant of stringNew() that makes a new string
297
 *          from a segment of the input string.  The segment is specified
298
 *          by the starting position and the number of bytes.
299
 *      (2) The start location %start must be within the string %src.
300
 *      (3) The copy is truncated to the end of the source string.
301
 *          Use %nbytes = 0 to copy to the end of %src.
302
 * </pre>
303
 */
304
char *
305
stringCopySegment(const char  *src,
306
                  l_int32      start,
307
                  l_int32      nbytes)
308
0
{
309
0
char    *dest;
310
0
l_int32  len;
311
312
0
    if (!src)
313
0
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
314
0
    len = strlen(src);
315
0
    if (start < 0 || start > len - 1)
316
0
        return (char *)ERROR_PTR("invalid start", __func__, NULL);
317
0
    if (nbytes <= 0)  /* copy to the end */
318
0
        nbytes = len - start;
319
0
    if (start + nbytes > len)  /* truncate to the end */
320
0
        nbytes = len - start;
321
0
    if ((dest = (char *)LEPT_CALLOC(nbytes + 1, sizeof(char))) == NULL)
322
0
        return (char *)ERROR_PTR("dest not made", __func__, NULL);
323
0
    stringCopy(dest, src + start, nbytes);
324
0
    return dest;
325
0
}
326
327
328
/*!
329
 * \brief   stringReplace()
330
 *
331
 * \param[out]   pdest    string copy
332
 * \param[in]    src      [optional] string; can be null
333
 * \return  0 if OK; 1 on error
334
 *
335
 * <pre>
336
 * Notes:
337
 *      (1) Frees any existing dest string
338
 *      (2) Puts a copy of src string in the dest
339
 *      (3) If either or both strings are null, does something reasonable.
340
 * </pre>
341
 */
342
l_ok
343
stringReplace(char       **pdest,
344
              const char  *src)
345
3.23M
{
346
3.23M
    if (!pdest)
347
0
        return ERROR_INT("pdest not defined", __func__, 1);
348
349
3.23M
    if (*pdest)
350
0
        LEPT_FREE(*pdest);
351
352
3.23M
    if (src)
353
0
        *pdest = stringNew(src);
354
3.23M
    else
355
3.23M
        *pdest = NULL;
356
3.23M
    return 0;
357
3.23M
}
358
359
360
/*!
361
 * \brief   stringLength()
362
 *
363
 * \param[in]    src    string can be null or NULL-terminated string
364
 * \param[in]    size   number of bytes to check; e.g., size of src buffer
365
 * \return  length of src in bytes; 0 if no bytes are found;
366
 *                                  %size on error when NUL byte is not found.
367
 *
368
 * <pre>
369
 * Notes:
370
 *      (1) Safe implementation of strlen that only checks %size bytes
371
 *          for trailing NUL.
372
 *      (2) Valid returned string lengths are between 0 and size - 1.
373
 *          If %size bytes are checked without finding a NUL byte, then
374
 *          an error is indicated by returning %size.
375
 * </pre>
376
 */
377
l_int32
378
stringLength(const char  *src,
379
             size_t       size)
380
0
{
381
0
l_int32  i;
382
383
0
    if (!src)
384
0
        return 0;
385
0
    if (size < 1)
386
0
        return ERROR_INT("size < 1; too small", __func__, 0);
387
388
0
    for (i = 0; i < size; i++) {
389
0
        if (src[i] == '\0')
390
0
            return i;
391
0
    }
392
393
        /* Didn't find a NUL byte */
394
0
    L_ERROR("NUL byte not found in %zu bytes\n", __func__, size);
395
0
    return size;
396
0
}
397
398
399
/*!
400
 * \brief   stringCat()
401
 *
402
 * \param[in]    dest    null-terminated byte buffer
403
 * \param[in]    size    size of dest buffer
404
 * \param[in]    src     string can be null or NULL-terminated string
405
 * \return  number of bytes added to dest; -1 on error
406
 *
407
 * <pre>
408
 * Notes:
409
 *      (1) Alternative implementation of strncat, that checks the input,
410
 *          is easier to use (since the size of the dest buffer is specified
411
 *          rather than the number of bytes to copy), and does not complain
412
 *          if %src is null.
413
 *      (2) Never writes past end of dest.
414
 *      (3) If there is not enough room to append the src, which is an error,
415
 *          it does nothing.
416
 *      (4) N.B. The order of 2nd and 3rd args is reversed from that in
417
 *          strncat, as in the Windows function strcat_s().
418
 * </pre>
419
 */
420
l_int32
421
stringCat(char        *dest,
422
          size_t       size,
423
          const char  *src)
424
0
{
425
0
l_int32  i, n;
426
0
l_int32  lendest, lensrc;
427
428
0
    if (!dest)
429
0
        return ERROR_INT("dest not defined", __func__, -1);
430
0
    if (size < 1)
431
0
        return ERROR_INT("size < 1; too small", __func__, -1);
432
0
    if (!src)
433
0
        return 0;
434
435
0
    lendest = stringLength(dest, size);
436
0
    if (lendest == size)
437
0
        return ERROR_INT("no terminating nul byte", __func__, -1);
438
0
    lensrc = stringLength(src, size);
439
0
    if (lensrc == 0)
440
0
        return 0;  /* nothing added to dest */
441
0
    n = (lendest + lensrc > size - 1) ? 0 : lensrc;
442
0
    if (n == 0)
443
0
        return ERROR_INT("dest too small for append", __func__, -1);
444
445
0
    for (i = 0; i < n; i++)
446
0
        dest[lendest + i] = src[i];
447
0
    dest[lendest + n] = '\0';
448
0
    return n;
449
0
}
450
451
452
/*!
 * \brief   stringConcatNew()
 *
 * \param[in]    first    first string in list
 * \param[in]    ...      NULL-terminated list of strings
 * \return  result new string concatenating the input strings, or
 *                      NULL if first == NULL or on allocation failure
 *
 * <pre>
 * Notes:
 *      (1) The last arg in the list of strings must be NULL.
 *      (2) Caller must free the returned string.
 *      (3) The argument list is traversed twice: once to find the
 *          total length, and again to copy the strings.
 *      (4) A null %first returns NULL silently, without reading any
 *          of the variable arguments.
 * </pre>
 */
char *
stringConcatNew(const char  *first, ...)
{
size_t       len;
char        *result, *ptr;
const char  *arg;
va_list      args;

    if (!first) return NULL;

        /* Find the length of the output string */
    va_start(args, first);
    len = strlen(first);
    while ((arg = va_arg(args, const char *)) != NULL)
        len += strlen(arg);
    va_end(args);

        /* The buffer must exist before anything is written into it */
    if ((result = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("result not made", __func__, NULL);

        /* Concatenate the args, starting with %first.  The terminating
         * NUL is the last byte of the zeroed buffer. */
    va_start(args, first);
    ptr = result;
    for (arg = first; arg != NULL; arg = va_arg(args, const char *)) {
        while (*arg)
            *ptr++ = *arg++;
    }
    va_end(args);
    return result;
}
497
498
499
/*!
500
 * \brief   stringJoin()
501
 *
502
 * \param[in]    src1    [optional] string; can be null
503
 * \param[in]    src2    [optional] string; can be null
504
 * \return  concatenated string, or NULL on error
505
 *
506
 * <pre>
507
 * Notes:
508
 *      (1) This is a safe version of strcat; it makes a new string.
509
 *      (2) It is not an error if either or both of the strings
510
 *          are empty, or if either or both of the pointers are null.
511
 * </pre>
512
 */
513
char *
514
stringJoin(const char  *src1,
515
           const char  *src2)
516
0
{
517
0
char    *dest;
518
0
l_int32  srclen1, srclen2, destlen;
519
520
0
    srclen1 = (src1) ? strlen(src1) : 0;
521
0
    srclen2 = (src2) ? strlen(src2) : 0;
522
0
    destlen = srclen1 + srclen2 + 3;
523
524
0
    if ((dest = (char *)LEPT_CALLOC(destlen, sizeof(char))) == NULL)
525
0
        return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
526
527
0
    if (src1)
528
0
        stringCat(dest, destlen, src1);
529
0
    if (src2)
530
0
        stringCat(dest, destlen, src2);
531
0
    return dest;
532
0
}
533
534
535
/*!
536
 * \brief   stringJoinIP()
537
 *
538
 * \param[in,out]  psrc1   address of string src1; cannot be on the stack
539
 * \param[in]      src2    [optional] string; can be null
540
 * \return  0 if OK, 1 on error
541
 *
542
 * <pre>
543
 * Notes:
544
 *      (1) This is a safe in-place version of strcat.  The contents of
545
 *          src1 is replaced by the concatenation of src1 and src2.
546
 *      (2) It is not an error if either or both of the strings
547
 *          are empty (""), or if the pointers to the strings (*psrc1, src2)
548
 *          are null.
549
 *      (3) src1 should be initialized to null or an empty string
550
 *          before the first call.  Use one of these:
551
 *              char *src1 = NULL;
552
 *              char *src1 = stringNew("");
553
 *          Then call with:
554
 *              stringJoinIP(&src1, src2);
555
 *      (4) This can also be implemented as a macro:
556
 * \code
557
 *              #define stringJoinIP(src1, src2) \
558
 *                  {tmpstr = stringJoin((src1),(src2)); \
559
 *                  LEPT_FREE(src1); \
560
 *                  (src1) = tmpstr;}
561
 * \endcode
562
 *      (5) Another function to consider for joining many strings is
563
 *          stringConcatNew().
564
 * </pre>
565
 */
566
l_ok
567
stringJoinIP(char       **psrc1,
568
             const char  *src2)
569
0
{
570
0
char  *tmpstr;
571
572
0
    if (!psrc1)
573
0
        return ERROR_INT("&src1 not defined", __func__, 1);
574
575
0
    tmpstr = stringJoin(*psrc1, src2);
576
0
    LEPT_FREE(*psrc1);
577
0
    *psrc1 = tmpstr;
578
0
    return 0;
579
0
}
580
581
582
/*!
583
 * \brief   stringReverse()
584
 *
585
 * \param[in]    src    string
586
 * \return  dest newly-allocated reversed string
587
 */
588
char *
589
stringReverse(const char  *src)
590
0
{
591
0
char    *dest;
592
0
l_int32  i, len;
593
594
0
    if (!src)
595
0
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
596
0
    len = strlen(src);
597
0
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
598
0
        return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
599
0
    for (i = 0; i < len; i++)
600
0
        dest[i] = src[len - 1 - i];
601
602
0
    return dest;
603
0
}
604
605
606
/*!
 * \brief   strtokSafe()
 *
 * \param[in]      cstr      input string to be sequentially parsed;
 *                           use NULL after the first call
 * \param[in]      seps      a string of character separators
 * \param[in,out]  psaveptr  on return, ptr to the next char after
 *                           the last encountered run of separators, or
 *                           NULL if nothing remains; read on input
 *                           only when %cstr is NULL
 * \return  substr         a new string that is copied from the previous
 *                         saveptr up to but not including the next
 *                         separator character, or NULL if end of cstr
 *                         or on error.
 *
 * <pre>
 * Notes:
 *      (1) This is a thread-safe implementation of strtok.
 *      (2) It has the same interface as strtok_r.
 *      (3) It differs from strtok_r in usage in two respects:
 *          (a) the input string is not altered
 *          (b) each returned substring is newly allocated and must
 *              be freed after use.
 *      (4) It is here because, surprisingly, some C libraries don't
 *          include strtok_r.
 *      (5) Important usage points:
 *          ~ Input the string to be parsed on the first invocation.
 *          ~ Then input NULL after that; the value returned in saveptr
 *            is used in all subsequent calls.
 *      (6) Leading separators are skipped only on the first call
 *          (%cstr not NULL).  On later calls the scan starts exactly
 *          at *psaveptr, which this function always leaves on a
 *          non-separator character.
 *      (7) If the substring cannot be allocated, *psaveptr is set to
 *          NULL and NULL is returned, which ends the parse.
 * </pre>
 */
char *
strtokSafe(char        *cstr,
           const char  *seps,
           char       **psaveptr)
{
char    *start, *next, *substr;
size_t   nchars;

    if (!seps)
        return (char *)ERROR_PTR("seps not defined", __func__, NULL);
    if (!psaveptr)
        return (char *)ERROR_PTR("&saveptr not defined", __func__, NULL);

    if (!cstr) {
        start = *psaveptr;
    } else {
        start = cstr;
        *psaveptr = NULL;
    }
    if (!start)  /* nothing to do */
        return NULL;

        /* First time, skip to the first non-sep character */
    if (cstr) {
        start += strspn(start, seps);
        if (*start == '\0') {
            *psaveptr = NULL;  /* in case caller doesn't check ret value */
            return NULL;
        }
    }

        /* The substring runs up to the next sep character, or to
         * the end of the string if there is none. */
    nchars = strcspn(start, seps);
    if ((substr = (char *)LEPT_CALLOC(nchars + 1, sizeof(char))) == NULL) {
        *psaveptr = NULL;
        return (char *)ERROR_PTR("substr not made", __func__, NULL);
    }
    memcpy(substr, start, nchars);

        /* Look for the next non-sep character.
         * If this is the last substring, return a null saveptr. */
    next = start + nchars;
    next += strspn(next, seps);
    *psaveptr = (*next == '\0') ? NULL : next;

    return substr;
}
703
704
705
/*!
706
 * \brief   stringSplitOnToken()
707
 *
708
 * \param[in]    cstr     input string to be split; not altered
709
 * \param[in]    seps     a string of character separators
710
 * \param[out]   phead    ptr to copy of the input string, up to
711
 *                        the first separator token encountered
712
 * \param[out]   ptail    ptr to copy of the part of the input string
713
 *                        starting with the first non-separator character
714
 *                        that occurs after the first separator is found
715
 * \return  0 if OK, 1 on error
716
 *
717
 * <pre>
718
 * Notes:
719
 *      (1) The input string is not altered; all split parts are new strings.
720
 *      (2) The split occurs around the first consecutive sequence of
721
 *          tokens encountered.
722
 *      (3) The head goes from the beginning of the string up to
723
 *          but not including the first token found.
724
 *      (4) The tail contains the second part of the string, starting
725
 *          with the first char in that part that is NOT a token.
726
 *      (5) If no separator token is found, 'head' contains a copy
727
 *          of the input string and 'tail' is null.
728
 * </pre>
729
 */
730
l_ok
731
stringSplitOnToken(char        *cstr,
732
                   const char  *seps,
733
                   char       **phead,
734
                   char       **ptail)
735
0
{
736
0
char  *saveptr;
737
738
0
    if (!phead)
739
0
        return ERROR_INT("&head not defined", __func__, 1);
740
0
    if (!ptail)
741
0
        return ERROR_INT("&tail not defined", __func__, 1);
742
0
    *phead = *ptail = NULL;
743
0
    if (!cstr)
744
0
        return ERROR_INT("cstr not defined", __func__, 1);
745
0
    if (!seps)
746
0
        return ERROR_INT("seps not defined", __func__, 1);
747
748
0
    *phead = strtokSafe(cstr, seps, &saveptr);
749
0
    if (saveptr)
750
0
        *ptail = stringNew(saveptr);
751
0
    return 0;
752
0
}
753
754
755
/*--------------------------------------------------------------------*
756
 *                       Find and replace procs                       *
757
 *--------------------------------------------------------------------*/
758
/*!
759
 * \brief   stringCheckForChars()
760
 *
761
 * \param[in]    src      input string; can be of zero length
762
 * \param[in]    chars    string of chars to be searched for in %src
763
 * \param[out]   pfound   1 if any characters are found; 0 otherwise
764
 * \return  0 if OK, 1 on error
765
 *
766
 * <pre>
767
 * Notes:
768
 *      (1) This can be used to sanitize an operation by checking for
769
 *          special characters that don't belong in a string.
770
 * </pre>
771
 */
772
l_ok
773
stringCheckForChars(const char  *src,
774
                    const char  *chars,
775
                    l_int32     *pfound)
776
0
{
777
0
char     ch;
778
0
l_int32  i, n;
779
780
0
    if (!pfound)
781
0
        return ERROR_INT("&found not defined", __func__, 1);
782
0
    *pfound = FALSE;
783
0
    if (!src || !chars)
784
0
        return ERROR_INT("src and chars not both defined", __func__, 1);
785
786
0
    n = strlen(src);
787
0
    for (i = 0; i < n; i++) {
788
0
        ch = src[i];
789
0
        if (strchr(chars, ch)) {
790
0
            *pfound = TRUE;
791
0
            break;
792
0
        }
793
0
    }
794
0
    return 0;
795
0
}
796
797
798
/*!
 * \brief   stringRemoveChars()
 *
 * \param[in]    src        input string; can be of zero length
 * \param[in]    remchars   string of chars to be removed from src
 * \return  dest string with specified chars removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The result is always a new string; %src is not altered.
 *      (2) If %remchars is null, a plain copy of %src is returned.
 *      (3) The output buffer is allocated at the size of %src, which
 *          is the largest the result can be.
 * </pre>
 */
char *
stringRemoveChars(const char  *src,
                  const char  *remchars)
{
const char  *in;
char        *dest, *out;

    if (!src)
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
    if (!remchars)
        return stringNew(src);

    dest = (char *)LEPT_CALLOC(strlen(src) + 1, sizeof(char));
    if (dest == NULL)
        return (char *)ERROR_PTR("dest not made", __func__, NULL);

        /* Keep each byte of %src that is not listed in %remchars */
    out = dest;
    for (in = src; *in != '\0'; in++) {
        if (strchr(remchars, *in) == NULL)
            *out++ = *in;
    }

    return dest;
}
829
830
831
/*!
832
 * \brief   stringReplaceEachSubstr()
833
 *
834
 * \param[in]    src      input string; can be of zero length
835
 * \param[in]    sub1     substring to be replaced
836
 * \param[in]    sub2     substring to put in; can be ""
837
 * \param[out]   pcount   [optional] the number of times that sub1
838
 *                        is found in src; 0 if not found
839
 * \return  dest string with substring replaced, or NULL if the
840
 *              substring not found or on error.
841
 *
842
 * <pre>
843
 * Notes:
844
 *      (1) This is a wrapper for simple string substitution that uses
845
 *          the more general function arrayReplaceEachSequence().
846
 *      (2) This finds every non-overlapping occurrence of %sub1 in
847
 *          %src, and replaces it with %sub2.  By "non-overlapping"
848
 *          we mean that after it finds each match, it removes the
849
 *          matching characters, replaces with the substitution string
850
 *          (if not empty), and continues.  For example, if you replace
851
 *          'aa' by 'X' in 'baaabbb', you find one match at position 1
852
 *          and return 'bXabbb'.
853
 *      (3) To only remove each instance of sub1, use "" for sub2
854
 *      (4) Returns a copy of %src if sub1 and sub2 are the same.
855
 *      (5) If the input %src is binary data that can have null characters,
856
 *          use arrayReplaceEachSequence() directly.
857
 * </pre>
858
 */
859
char *
860
stringReplaceEachSubstr(const char  *src,
861
                        const char  *sub1,
862
                        const char  *sub2,
863
                        l_int32     *pcount)
864
0
{
865
0
size_t  datalen;
866
867
0
    if (pcount) *pcount = 0;
868
0
    if (!src || !sub1 || !sub2)
869
0
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
870
0
                                 __func__, NULL);
871
872
0
    if (strlen(sub2) > 0) {
873
0
        return (char *)arrayReplaceEachSequence(
874
0
                               (const l_uint8 *)src, strlen(src),
875
0
                               (const l_uint8 *)sub1, strlen(sub1),
876
0
                               (const l_uint8 *)sub2, strlen(sub2),
877
0
                               &datalen, pcount);
878
0
    } else {  /* empty replacement string; removal only */
879
0
        return (char *)arrayReplaceEachSequence(
880
0
                               (const l_uint8 *)src, strlen(src),
881
0
                               (const l_uint8 *)sub1, strlen(sub1),
882
0
                               NULL, 0, &datalen, pcount);
883
0
    }
884
0
}
885
886
887
/*!
888
 * \brief   stringReplaceSubstr()
889
 *
890
 * \param[in]      src      input string; can be of zero length
891
 * \param[in]      sub1     substring to be replaced
892
 * \param[in]      sub2     substring to put in; can be ""
893
 * \param[in,out]  ploc     [optional] input start location for search;
894
 *                          returns the loc after replacement
895
 * \param[out]     pfound   [optional] 1 if sub1 is found; 0 otherwise
896
 * \return  dest string with substring replaced, or NULL on error.
897
 *
898
 * <pre>
899
 * Notes:
900
 *      (1) Replaces the first instance.
901
 *      (2) To remove sub1 without replacement, use "" for sub2.
902
 *      (3) Returns a copy of %src if either no instance of %sub1 is found,
903
 *          or if %sub1 and %sub2 are the same.
904
 *      (4) If %ploc == NULL, the search will start at the beginning of %src.
905
 *          If %ploc != NULL, *ploc must be initialized to the byte offset
906
 *          within %src from which the search starts.  To search the
907
 *          string from the beginning, set %loc = 0 and input &loc.
908
 *          After finding %sub1 and replacing it with %sub2, %loc will be
909
 *          returned as the next position after %sub2 in the output string.
910
 *      (5) Note that the output string also includes all the characters
911
 *          from the input string that occur after the single substitution.
912
 * </pre>
913
 */
914
char *
915
stringReplaceSubstr(const char  *src,
916
                    const char  *sub1,
917
                    const char  *sub2,
918
                    l_int32     *ploc,
919
                    l_int32     *pfound)
920
0
{
921
0
const char  *ptr;
922
0
char        *dest;
923
0
l_int32      nsrc, nsub1, nsub2, len, npre, loc;
924
925
0
    if (pfound) *pfound = 0;
926
0
    if (!src || !sub1 || !sub2)
927
0
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
928
0
                                 __func__, NULL);
929
930
0
    if (ploc)
931
0
        loc = *ploc;
932
0
    else
933
0
        loc = 0;
934
0
    if (!strcmp(sub1, sub2))
935
0
        return stringNew(src);
936
0
    if ((ptr = strstr(src + loc, sub1)) == NULL)
937
0
        return stringNew(src);
938
0
    if (pfound) *pfound = 1;
939
940
0
    nsrc = strlen(src);
941
0
    nsub1 = strlen(sub1);
942
0
    nsub2 = strlen(sub2);
943
0
    len = nsrc + nsub2 - nsub1;
944
0
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
945
0
        return (char *)ERROR_PTR("dest not made", __func__, NULL);
946
0
    npre = ptr - src;
947
0
    memcpy(dest, src, npre);
948
0
    strcpy(dest + npre, sub2);
949
0
    strcpy(dest + npre + nsub2, ptr + nsub1);
950
0
    if (ploc) *ploc = npre + nsub2;
951
0
    return dest;
952
0
}
953
954
955
/*!
956
 * \brief   stringFindEachSubstr()
957
 *
958
 * \param[in]    src        input string; can be of zero length
959
 * \param[in]    sub        substring to be searched for
960
 * \return  dna of offsets where the sequence is found, or NULL if
961
 *              none are found or on error
962
 *
963
 * <pre>
964
 * Notes:
965
 *      (1) This finds every non-overlapping occurrence in %src of %sub.
966
 *          After it finds each match, it moves forward in %src by the length
967
 *          of %sub before continuing the search.  So for example,
968
 *          if you search for the sequence 'aa' in the data 'baaabbb',
969
 *          you find one match at position 1.
970
971
 * </pre>
972
 */
973
L_DNA *
974
stringFindEachSubstr(const char  *src,
975
                     const char  *sub)
976
0
{
977
0
    if (!src || !sub)
978
0
        return (L_DNA *)ERROR_PTR("src, sub not both defined", __func__, NULL);
979
980
0
    return arrayFindEachSequence((const l_uint8 *)src, strlen(src),
981
0
                                 (const l_uint8 *)sub, strlen(sub));
982
0
}
983
984
985
/*!
986
 * \brief   stringFindSubstr()
987
 *
988
 * \param[in]    src     input string; can be of zero length
989
 * \param[in]    sub     substring to be searched for; must not be empty
990
 * \param[out]   ploc    [optional] location of substring in src
991
 * \return  1 if found; 0 if not found or on error
992
 *
993
 * <pre>
994
 * Notes:
995
 *      (1) This is a wrapper around strstr().  It finds the first
996
 *          instance of %sub in %src.  If the substring is not found
997
 *          and the location is returned, it has the value -1.
998
 *      (2) Both %src and %sub must be defined, and %sub must have
999
 *          length of at least 1.
1000
 * </pre>
1001
 */
1002
l_int32
1003
stringFindSubstr(const char  *src,
1004
                 const char  *sub,
1005
                 l_int32     *ploc)
1006
0
{
1007
0
const char *ptr;
1008
1009
0
    if (ploc) *ploc = -1;
1010
0
    if (!src || !sub)
1011
0
        return ERROR_INT("src and sub not both defined", __func__, 0);
1012
0
    if (strlen(sub) == 0)
1013
0
        return ERROR_INT("substring length 0", __func__, 0);
1014
0
    if (strlen(src) == 0)
1015
0
        return 0;
1016
1017
0
    if ((ptr = strstr(src, sub)) == NULL)  /* not found */
1018
0
        return 0;
1019
1020
0
    if (ploc)
1021
0
        *ploc = ptr - src;
1022
0
    return 1;
1023
0
}
1024
1025
1026
/*!
1027
 * \brief   arrayReplaceEachSequence()
1028
 *
1029
 * \param[in]    datas       source byte array
1030
 * \param[in]    dataslen    length of source data, in bytes
1031
 * \param[in]    seq         subarray of bytes to find in source data
1032
 * \param[in]    seqlen      length of subarray, in bytes
1033
 * \param[in]    newseq      replacement subarray; can be null
1034
 * \param[in]    newseqlen   length of replacement subarray, in bytes
1035
 * \param[out]   pdatadlen   length of dest byte array, in bytes
1036
 * \param[out]   pcount      [optional] the number of times that sub1
1037
 *                           is found in src; 0 if not found
1038
 * \return  datad   with all all subarrays replaced (or removed)
1039
 *
1040
 * <pre>
1041
 * Notes:
1042
 *      (1) The byte arrays %datas, %seq and %newseq are not C strings,
1043
 *          because they can contain null bytes.  Therefore, for each
1044
 *          we must give the length of the array.
1045
 *      (2) If %newseq == NULL, this just removes all instances of %seq.
1046
 *          Otherwise, it replaces every non-overlapping occurrence of
1047
 *          %seq in %datas with %newseq. A new array %datad and its
1048
 *          size are returned.  See arrayFindEachSequence() for more
1049
 *          details on finding non-overlapping occurrences.
1050
 *      (3) If no instances of %seq are found, this returns a copy of %datas.
1051
 *      (4) The returned %datad is null terminated.
1052
 *      (5) Can use stringReplaceEachSubstr() if using C strings.
1053
 * </pre>
1054
 */
1055
l_uint8 *
1056
arrayReplaceEachSequence(const l_uint8  *datas,
1057
                         size_t          dataslen,
1058
                         const l_uint8  *seq,
1059
                         size_t          seqlen,
1060
                         const l_uint8  *newseq,
1061
                         size_t          newseqlen,
1062
                         size_t         *pdatadlen,
1063
                         l_int32        *pcount)
1064
0
{
1065
0
l_uint8  *datad;
1066
0
size_t    newsize;
1067
0
l_int32   n, i, j, di, si, index, incr;
1068
0
L_DNA    *da;
1069
1070
0
    if (pcount) *pcount = 0;
1071
0
    if (!datas || !seq)
1072
0
        return (l_uint8 *)ERROR_PTR("datas & seq not both defined",
1073
0
                                    __func__, NULL);
1074
0
    if (!pdatadlen)
1075
0
        return (l_uint8 *)ERROR_PTR("&datadlen not defined", __func__, NULL);
1076
0
    *pdatadlen = 0;
1077
1078
        /* Identify the locations of the sequence.  If there are none,
1079
         * return a copy of %datas. */
1080
0
    if ((da = arrayFindEachSequence(datas, dataslen, seq, seqlen)) == NULL) {
1081
0
        *pdatadlen = dataslen;
1082
0
        return l_binaryCopy(datas, dataslen);
1083
0
    }
1084
1085
        /* Allocate the output data; insure null termination */
1086
0
    n = l_dnaGetCount(da);
1087
0
    if (pcount) *pcount = n;
1088
0
    if (!newseq) newseqlen = 0;
1089
0
    newsize = dataslen + n * (newseqlen - seqlen) + 4;
1090
0
    if ((datad = (l_uint8 *)LEPT_CALLOC(newsize, sizeof(l_uint8))) == NULL) {
1091
0
        l_dnaDestroy(&da);
1092
0
        return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1093
0
    }
1094
1095
        /* Replace each sequence instance with a new sequence */
1096
0
    l_dnaGetIValue(da, 0, &si);
1097
0
    for (i = 0, di = 0, index = 0; i < dataslen; i++) {
1098
0
        if (i == si) {
1099
0
            index++;
1100
0
            if (index < n) {
1101
0
                l_dnaGetIValue(da, index, &si);
1102
0
                incr = L_MIN(seqlen, si - i);  /* amount to remove from datas */
1103
0
            } else {
1104
0
                incr = seqlen;
1105
0
            }
1106
0
            i += incr - 1;  /* jump over the matched sequence in datas */
1107
0
            if (newseq) {  /* add new sequence to datad */
1108
0
                for (j = 0; j < newseqlen; j++)
1109
0
                    datad[di++] = newseq[j];
1110
0
            }
1111
0
        } else {
1112
0
            datad[di++] = datas[i];
1113
0
        }
1114
0
    }
1115
1116
0
    *pdatadlen = di;
1117
0
    l_dnaDestroy(&da);
1118
0
    return datad;
1119
0
}
1120
1121
1122
/*!
1123
 * \brief   arrayFindEachSequence()
1124
 *
1125
 * \param[in]    data       byte array
1126
 * \param[in]    datalen    length of data, in bytes
1127
 * \param[in]    sequence   subarray of bytes to find in data
1128
 * \param[in]    seqlen     length of sequence, in bytes
1129
 * \return  dna of offsets where the sequence is found, or NULL if
1130
 *              none are found or on error
1131
 *
1132
 * <pre>
1133
 * Notes:
1134
 *      (1) The byte arrays %data and %sequence are not C strings,
1135
 *          because they can contain null bytes.  Therefore, for each
1136
 *          we must give the length of the array.
1137
 *      (2) This finds every non-overlapping occurrence in %data of %sequence.
1138
 *          After it finds each match, it moves forward by the length
1139
 *          of the sequence before continuing the search.  So for example,
1140
 *          if you search for the sequence 'aa' in the data 'baaabbb',
1141
 *          you find one match at position 1.
1142
 * </pre>
1143
 */
1144
L_DNA *
1145
arrayFindEachSequence(const l_uint8  *data,
1146
                      size_t          datalen,
1147
                      const l_uint8  *sequence,
1148
                      size_t          seqlen)
1149
0
{
1150
0
l_int32  start, offset, realoffset, found;
1151
0
L_DNA   *da;
1152
1153
0
    if (!data || !sequence)
1154
0
        return (L_DNA *)ERROR_PTR("data & sequence not both defined",
1155
0
                                  __func__, NULL);
1156
1157
0
    da = l_dnaCreate(0);
1158
0
    start = 0;
1159
0
    while (1) {
1160
0
        arrayFindSequence(data + start, datalen - start, sequence, seqlen,
1161
0
                          &offset, &found);
1162
0
        if (found == FALSE)
1163
0
            break;
1164
1165
0
        realoffset = start + offset;
1166
0
        l_dnaAddNumber(da, realoffset);
1167
0
        start = realoffset + seqlen;
1168
0
        if (start >= datalen)
1169
0
            break;
1170
0
    }
1171
1172
0
    if (l_dnaGetCount(da) == 0)
1173
0
        l_dnaDestroy(&da);
1174
0
    return da;
1175
0
}
1176
1177
1178
/*!
1179
 * \brief   arrayFindSequence()
1180
 *
1181
 * \param[in]    data       byte array
1182
 * \param[in]    datalen    length of data, in bytes
1183
 * \param[in]    sequence   subarray of bytes to find in data
1184
 * \param[in]    seqlen     length of sequence, in bytes
1185
 * \param[out]   poffset    offset from beginning of
1186
 *                          data where the sequence begins
1187
 * \param[out]   pfound     1 if sequence is found; 0 otherwise
1188
 * \return  0 if OK, 1 on error
1189
 *
1190
 * <pre>
1191
 * Notes:
1192
 *      (1) The byte arrays 'data' and 'sequence' are in general not C strings,
1193
 *          because they can contain null bytes.  Therefore, for each
1194
 *          we must give the length of the array.
1195
 *      (2) This searches for the first occurrence in %data of %sequence,
1196
 *          which consists of %seqlen bytes.  The parameter %seqlen
1197
 *          must not exceed the actual length of the %sequence byte array.
1198
 *      (3) If either byte array is a C string, cast the array to
1199
 *          (const l_uint8 *) and use strlen() on the string for its length.
1200
 *      (4) If the sequence is not found, the offset will be 0, so you
1201
 *          must check %found.
1202
 * </pre>
1203
 */
1204
l_ok
1205
arrayFindSequence(const l_uint8  *data,
1206
                  size_t          datalen,
1207
                  const l_uint8  *sequence,
1208
                  size_t          seqlen,
1209
                  l_int32        *poffset,
1210
                  l_int32        *pfound)
1211
0
{
1212
0
l_int32  i, j, found, lastpos;
1213
1214
0
    if (poffset) *poffset = 0;
1215
0
    if (pfound) *pfound = FALSE;
1216
0
    if (!data || !sequence)
1217
0
        return ERROR_INT("data & sequence not both defined", __func__, 1);
1218
0
    if (!poffset || !pfound)
1219
0
        return ERROR_INT("&offset and &found not defined", __func__, 1);
1220
1221
0
    lastpos = datalen - seqlen + 1;
1222
0
    found = FALSE;
1223
0
    for (i = 0; i < lastpos; i++) {
1224
0
        for (j = 0; j < seqlen; j++) {
1225
0
            if (data[i + j] != sequence[j])
1226
0
                 break;
1227
0
            if (j == seqlen - 1)
1228
0
                 found = TRUE;
1229
0
        }
1230
0
        if (found == TRUE)
1231
0
            break;
1232
0
    }
1233
1234
0
    if (found == TRUE) {
1235
0
        *poffset = i;
1236
0
        *pfound = TRUE;
1237
0
    }
1238
0
    return 0;
1239
0
}
1240
1241
1242
/*--------------------------------------------------------------------*
1243
 *                             Safe realloc                           *
1244
 *--------------------------------------------------------------------*/
1245
/*!
 * \brief   reallocNew()
 *
 * \param[in,out]  pindata    nulls indata before reallocing
 * \param[in]      oldsize    size of input data to be copied, in bytes
 * \param[in]      newsize    size of buffer to be reallocated in bytes
 * \return  ptr to new data, or NULL on error
 *
 *  Action (N.B. steps 3 and 4):
 *      1 Allocates memory, initialized to 0
 *      2 Copies as much of the input data as possible
 *          to the new block, truncating the copy if necessary
 *      3 Frees the input data
 *      4 Zeroes the input data ptr
 *
 * <pre>
 * Notes:
 *      (1) If newsize == 0, frees input data and nulls ptr; the
 *          return value is NULL.
 *      (2) If input data is null, only callocs new memory
 *      (3) This differs from realloc in that it always allocates
 *          new memory (if newsize > 0) and initializes it to 0,
 *          it requires the amount of old data to be copied,
 *          and it takes the address of the input ptr and
 *          nulls the handle.
 *      (4) If the allocation fails, the input data is neither freed
 *          nor is its handle nulled.
 * </pre>
 */
void *
reallocNew(void  **pindata,
           size_t  oldsize,
           size_t  newsize)
{
void  *olddata;
void  *fresh;

    if (!pindata)
        return ERROR_PTR("input data not defined", __func__, NULL);
    olddata = *pindata;

        /* Nonstandard usage: a zero request just releases the input */
    if (newsize == 0) {
        if (olddata) {
            LEPT_FREE(olddata);
            *pindata = NULL;
        }
        return NULL;
    }

        /* A zeroed block is needed whether or not there is old data */
    fresh = LEPT_CALLOC(1, newsize);
    if (fresh == NULL)
        return ERROR_PTR("newdata not made", __func__, NULL);

        /* Standard usage: carry over what fits, then retire the input.
         * With no input (nonstandard usage) there is nothing to copy. */
    if (olddata) {
        memcpy(fresh, olddata, L_MIN(oldsize, newsize));
        LEPT_FREE(olddata);
        *pindata = NULL;
    }
    return fresh;
}
1307
1308
1309
/*--------------------------------------------------------------------*
1310
 *                 Read and write between file and memory             *
1311
 *--------------------------------------------------------------------*/
1312
/*!
1313
 * \brief   l_binaryRead()
1314
 *
1315
 * \param[in]    filename
1316
 * \param[out]   pnbytes    number of bytes read
1317
 * \return  data, or NULL on error
1318
 */
1319
l_uint8 *
1320
l_binaryRead(const char  *filename,
1321
             size_t      *pnbytes)
1322
0
{
1323
0
l_uint8  *data;
1324
0
FILE     *fp;
1325
1326
0
    if (!pnbytes)
1327
0
        return (l_uint8 *)ERROR_PTR("pnbytes not defined", __func__, NULL);
1328
0
    *pnbytes = 0;
1329
0
    if (!filename)
1330
0
        return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1331
1332
0
    if ((fp = fopenReadStream(filename)) == NULL)
1333
0
        return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1334
0
                                      filename, __func__, NULL);
1335
0
    data = l_binaryReadStream(fp, pnbytes);
1336
0
    fclose(fp);
1337
0
    return data;
1338
0
}
1339
1340
1341
/*!
1342
 * \brief   l_binaryReadStream()
1343
 *
1344
 * \param[in]    fp        file stream opened to read; can be stdin
1345
 * \param[out]   pnbytes   number of bytes read
1346
 * \return  null-terminated array, or NULL on error; reading 0 bytes
1347
 *          is not an error
1348
 *
1349
 * <pre>
1350
 * Notes:
1351
 *      (1) The returned array is terminated with a null byte so that it can
1352
 *          be used to read ascii data from a file into a proper C string.
1353
 *      (2) This can be used to capture data that is piped in via stdin,
1354
 *          because it does not require seeking within the file.
1355
 *      (3) For example, you can read an image from stdin into memory
1356
 *          using shell redirection, with one of these shell commands:
1357
 * \code
1358
 *             cat <imagefile> | readprog
1359
 *             readprog < <imagefile>
1360
 * \endcode
1361
 *          where readprog is:
1362
 * \code
1363
 *             l_uint8 *data = l_binaryReadStream(stdin, &nbytes);
1364
 *             Pix *pix = pixReadMem(data, nbytes);
1365
 * \endcode
1366
 * </pre>
1367
 */
1368
l_uint8 *
1369
l_binaryReadStream(FILE    *fp,
1370
                   size_t  *pnbytes)
1371
0
{
1372
0
l_uint8    *data;
1373
0
l_int32     seekable, navail, nadd, nread;
1374
0
L_BBUFFER  *bb;
1375
1376
0
    if (!pnbytes)
1377
0
        return (l_uint8 *)ERROR_PTR("&nbytes not defined", __func__, NULL);
1378
0
    *pnbytes = 0;
1379
0
    if (!fp)
1380
0
        return (l_uint8 *)ERROR_PTR("fp not defined", __func__, NULL);
1381
1382
        /* Test if the stream is seekable, by attempting to seek to
1383
         * the start of data.  This is a no-op.  If it is seekable, use
1384
         * l_binaryReadSelectStream() to determine the size of the
1385
         * data to be read in advance. */
1386
0
    seekable = (ftell(fp) == 0) ? 1 : 0;
1387
0
    if (seekable)
1388
0
        return l_binaryReadSelectStream(fp, 0, 0, pnbytes);
1389
1390
        /* If it is not seekable, use the bbuffer to realloc memory
1391
         * as needed during reading. */
1392
0
    bb = bbufferCreate(NULL, 4096);
1393
0
    while (1) {
1394
0
        navail = bb->nalloc - bb->n;
1395
0
        if (navail < 4096) {
1396
0
             nadd = L_MAX(bb->nalloc, 4096);
1397
0
             bbufferExtendArray(bb, nadd);
1398
0
        }
1399
0
        nread = fread((void *)(bb->array + bb->n), 1, 4096, fp);
1400
0
        bb->n += nread;
1401
0
        if (nread != 4096) break;
1402
0
    }
1403
1404
        /* Copy the data to a new array sized for the data, because
1405
         * the bbuffer array can be nearly twice the size we need. */
1406
0
    if ((data = (l_uint8 *)LEPT_CALLOC(bb->n + 1, sizeof(l_uint8))) != NULL) {
1407
0
        memcpy(data, bb->array, bb->n);
1408
0
        *pnbytes = bb->n;
1409
0
    } else {
1410
0
        L_ERROR("calloc fail for data\n", __func__);
1411
0
    }
1412
1413
0
    bbufferDestroy(&bb);
1414
0
    return data;
1415
0
}
1416
1417
1418
/*!
1419
 * \brief   l_binaryReadSelect()
1420
 *
1421
 * \param[in]    filename
1422
 * \param[in]    start     first byte to read
1423
 * \param[in]    nbytes    number of bytes to read; use 0 to read to end of file
1424
 * \param[out]   pnread    number of bytes actually read
1425
 * \return  data, or NULL on error
1426
 *
1427
 * <pre>
1428
 * Notes:
1429
 *      (1) The returned array is terminated with a null byte so that it can
1430
 *          be used to read ascii data from a file into a proper C string.
1431
 * </pre>
1432
 */
1433
l_uint8 *
1434
l_binaryReadSelect(const char  *filename,
1435
                   size_t       start,
1436
                   size_t       nbytes,
1437
                   size_t      *pnread)
1438
0
{
1439
0
l_uint8  *data;
1440
0
FILE     *fp;
1441
1442
0
    if (!pnread)
1443
0
        return (l_uint8 *)ERROR_PTR("pnread not defined", __func__, NULL);
1444
0
    *pnread = 0;
1445
0
    if (!filename)
1446
0
        return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1447
1448
0
    if ((fp = fopenReadStream(filename)) == NULL)
1449
0
        return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1450
0
                                      filename, __func__, NULL);
1451
0
    data = l_binaryReadSelectStream(fp, start, nbytes, pnread);
1452
0
    fclose(fp);
1453
0
    return data;
1454
0
}
1455
1456
1457
/*!
1458
 * \brief   l_binaryReadSelectStream()
1459
 *
1460
 * \param[in]    fp       file stream
1461
 * \param[in]    start    first byte to read
1462
 * \param[in]    nbytes   number of bytes to read; use 0 to read to end of file
1463
 * \param[out]   pnread   number of bytes actually read
1464
 * \return  null-terminated array, or NULL on error; reading 0 bytes
1465
 *          is not an error
1466
 *
1467
 * <pre>
1468
 * Notes:
1469
 *      (1) The returned array is terminated with a null byte so that it can
1470
 *          be used to read ascii data from a file into a proper C string.
1471
 *          If the file to be read is empty and %start == 0, an array
1472
 *          with a single null byte is returned.
1473
 *      (2) Side effect: the stream pointer is re-positioned to the
1474
 *          beginning of the file.
1475
 * </pre>
1476
 */
1477
l_uint8 *
1478
l_binaryReadSelectStream(FILE    *fp,
1479
                         size_t   start,
1480
                         size_t   nbytes,
1481
                         size_t  *pnread)
1482
0
{
1483
0
l_uint8  *data;
1484
0
size_t    bytesleft, bytestoread, nread, filebytes;
1485
1486
0
    if (!pnread)
1487
0
        return (l_uint8 *)ERROR_PTR("&nread not defined", __func__, NULL);
1488
0
    *pnread = 0;
1489
0
    if (!fp)
1490
0
        return (l_uint8 *)ERROR_PTR("stream not defined", __func__, NULL);
1491
1492
        /* Verify and adjust the parameters if necessary */
1493
0
    fseek(fp, 0, SEEK_END);  /* EOF */
1494
0
    filebytes = ftell(fp);
1495
0
    fseek(fp, 0, SEEK_SET);
1496
0
    if (start > filebytes) {
1497
0
        L_ERROR("start = %zu but filebytes = %zu\n", __func__,
1498
0
                start, filebytes);
1499
0
        return NULL;
1500
0
    }
1501
0
    if (filebytes == 0)  /* start == 0; nothing to read; return null byte */
1502
0
        return (l_uint8 *)LEPT_CALLOC(1, 1);
1503
0
    bytesleft = filebytes - start;  /* greater than 0 */
1504
0
    if (nbytes == 0) nbytes = bytesleft;
1505
0
    bytestoread = (bytesleft >= nbytes) ? nbytes : bytesleft;
1506
1507
        /* Read the data */
1508
0
    if ((data = (l_uint8 *)LEPT_CALLOC(1, bytestoread + 1)) == NULL)
1509
0
        return (l_uint8 *)ERROR_PTR("calloc fail for data", __func__, NULL);
1510
0
    fseek(fp, start, SEEK_SET);
1511
0
    nread = fread(data, 1, bytestoread, fp);
1512
0
    if (nbytes != nread)
1513
0
        L_INFO("%zu bytes requested; %zu bytes read\n", __func__,
1514
0
               nbytes, nread);
1515
0
    *pnread = nread;
1516
0
    fseek(fp, 0, SEEK_SET);
1517
0
    return data;
1518
0
}
1519
1520
1521
/*!
1522
 * \brief   l_binaryWrite()
1523
 *
1524
 * \param[in]    filename     output file
1525
 * \param[in]    operation    "w" for write; "a" for append
1526
 * \param[in]    data         binary data to be written
1527
 * \param[in]    nbytes       size of data array
1528
 * \return  0 if OK; 1 on error
1529
 */
1530
l_ok
1531
l_binaryWrite(const char  *filename,
1532
              const char  *operation,
1533
              const void  *data,
1534
              size_t       nbytes)
1535
0
{
1536
0
char   actualOperation[20];
1537
0
FILE  *fp;
1538
1539
0
    if (!filename)
1540
0
        return ERROR_INT("filename not defined", __func__, 1);
1541
0
    if (!operation)
1542
0
        return ERROR_INT("operation not defined", __func__, 1);
1543
0
    if (!data)
1544
0
        return ERROR_INT("data not defined", __func__, 1);
1545
0
    if (nbytes <= 0)
1546
0
        return ERROR_INT("nbytes must be > 0", __func__, 1);
1547
1548
0
    if (strcmp(operation, "w") && strcmp(operation, "a"))
1549
0
        return ERROR_INT("operation not one of {'w','a'}", __func__, 1);
1550
1551
        /* The 'b' flag to fopen() is ignored for all POSIX
1552
         * conforming systems.  However, Windows needs the 'b' flag. */
1553
0
    stringCopy(actualOperation, operation, 2);
1554
0
    stringCat(actualOperation, 20, "b");
1555
1556
0
    if ((fp = fopenWriteStream(filename, actualOperation)) == NULL)
1557
0
        return ERROR_INT_1("stream not opened", filename, __func__, 1);
1558
0
    fwrite(data, 1, nbytes, fp);
1559
0
    fclose(fp);
1560
0
    return 0;
1561
0
}
1562
1563
1564
/*!
 * \brief   nbytesInFile()
 *
 * \param[in]    filename
 * \return  nbytes in file; 0 on error
 *
 * <pre>
 * Notes:
 *      (1) Opens %filename with fopenReadStream(), asks fnbytesInFile()
 *          for the size, and closes the stream.
 * </pre>
 */
size_t
nbytesInFile(const char  *filename)
{
FILE   *stream;
size_t  size;

    if (filename == NULL)
        return ERROR_INT("filename not defined", __func__, 0);

    stream = fopenReadStream(filename);
    if (stream == NULL)
        return ERROR_INT_1("stream not opened", filename, __func__, 0);

    size = fnbytesInFile(stream);
    fclose(stream);
    return size;
}
1584
1585
1586
/*!
1587
 * \brief   fnbytesInFile()
1588
 *
1589
 * \param[in]    fp    file stream
1590
 * \return  nbytes in file; 0 on error
1591
 */
1592
size_t
1593
fnbytesInFile(FILE  *fp)
1594
0
{
1595
0
l_int64  pos, nbytes;
1596
1597
0
    if (!fp)
1598
0
        return ERROR_INT("stream not open", __func__, 0);
1599
1600
0
    pos = ftell(fp);          /* initial position */
1601
0
    if (pos < 0)
1602
0
        return ERROR_INT("seek position must be > 0", __func__, 0);
1603
0
    fseek(fp, 0, SEEK_END);   /* EOF */
1604
0
    nbytes = ftell(fp);
1605
0
    if (nbytes < 0)
1606
0
        return ERROR_INT("nbytes is < 0", __func__, 0);
1607
0
    fseek(fp, pos, SEEK_SET);        /* back to initial position */
1608
0
    return nbytes;
1609
0
}
1610
1611
1612
/*--------------------------------------------------------------------*
1613
 *                     Copy and compare in memory                     *
1614
 *--------------------------------------------------------------------*/
1615
/*!
1616
 * \brief   l_binaryCopy()
1617
 *
1618
 * \param[in]    datas
1619
 * \param[in]    size    of data array
1620
 * \return  datad on heap, or NULL on error
1621
 *
1622
 * <pre>
1623
 * Notes:
1624
 *      (1) We add 4 bytes to the zeroed output because in some cases
1625
 *          (e.g., string handling) it is important to have the data
1626
 *          be null terminated.  This guarantees that after the memcpy,
1627
 *          the result is automatically null terminated.
1628
 * </pre>
1629
 */
1630
l_uint8 *
1631
l_binaryCopy(const l_uint8  *datas,
1632
             size_t          size)
1633
0
{
1634
0
l_uint8  *datad;
1635
1636
0
    if (!datas)
1637
0
        return (l_uint8 *)ERROR_PTR("datas not defined", __func__, NULL);
1638
1639
0
    if ((datad = (l_uint8 *)LEPT_CALLOC(size + 4, sizeof(l_uint8))) == NULL)
1640
0
        return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1641
0
    memcpy(datad, datas, size);
1642
0
    return datad;
1643
0
}
1644
1645
1646
/*!
1647
 * \brief   l_binaryCompare()
1648
 *
1649
 * \param[in]    data1
1650
 * \param[in]    size1   of data1
1651
 * \param[in]    data2
1652
 * \param[in]    size2   of data1
1653
 * \param[out]   psame  (1 if the same, 0 if different)
1654
 * \return  0 if OK, 1 on error
1655
 *
1656
 * <pre>
1657
 * Notes:
1658
 *      (1) This can also be used to compare C strings str1 and str2.
1659
 *          If the string lengths are not known, use strlen():
1660
 *            l_binaryCompare((l_uint8 *)str1, strlen(str1),
1661
                              (l_uint8 *)str2, strlen(str2));
1662
 * </pre>
1663
 */
1664
l_ok
1665
l_binaryCompare(const l_uint8  *data1,
1666
                size_t          size1,
1667
                const l_uint8  *data2,
1668
                size_t          size2,
1669
                l_int32        *psame)
1670
0
{
1671
0
l_int32  i;
1672
1673
0
    if (!psame)
1674
0
        return ERROR_INT("&same not defined", __func__, 1);
1675
0
    *psame = FALSE;
1676
0
    if (!data1 || !data2)
1677
0
        return ERROR_INT("data1 and data2 not both defined", __func__, 1);
1678
0
    if (size1 != size2) return 0;
1679
0
    for (i = 0; i < size1; i++) {
1680
0
        if (data1[i] != data2[i])
1681
0
            return 0;
1682
0
    }
1683
0
    *psame = TRUE;
1684
0
    return 0;
1685
0
}
1686
1687
1688
/*--------------------------------------------------------------------*
1689
 *                         File copy operations                       *
1690
 *--------------------------------------------------------------------*/
1691
/*!
1692
 * \brief   fileCopy()
1693
 *
1694
 * \param[in]    srcfile   copy from this file
1695
 * \param[in]    newfile   copy to this file
1696
 * \return  0 if OK, 1 on error
1697
 */
1698
l_ok
1699
fileCopy(const char  *srcfile,
1700
         const char  *newfile)
1701
0
{
1702
0
l_int32   ret;
1703
0
size_t    nbytes;
1704
0
l_uint8  *data;
1705
1706
0
    if (!srcfile)
1707
0
        return ERROR_INT("srcfile not defined", __func__, 1);
1708
0
    if (!newfile)
1709
0
        return ERROR_INT("newfile not defined", __func__, 1);
1710
1711
0
    if ((data = l_binaryRead(srcfile, &nbytes)) == NULL)
1712
0
        return ERROR_INT("data not returned", __func__, 1);
1713
0
    ret = l_binaryWrite(newfile, "w", data, nbytes);
1714
0
    LEPT_FREE(data);
1715
0
    return ret;
1716
0
}
1717
1718
1719
/*!
1720
 * \brief   fileConcatenate()
1721
 *
1722
 * \param[in]    srcfile   append data from this file
1723
 * \param[in]    destfile  add data to this file
1724
 * \return  0 if OK, 1 on error
1725
 */
1726
l_ok
1727
fileConcatenate(const char  *srcfile,
1728
                const char  *destfile)
1729
0
{
1730
0
size_t    nbytes;
1731
0
l_uint8  *data;
1732
1733
0
    if (!srcfile)
1734
0
        return ERROR_INT("srcfile not defined", __func__, 1);
1735
0
    if (!destfile)
1736
0
        return ERROR_INT("destfile not defined", __func__, 1);
1737
1738
0
    data = l_binaryRead(srcfile, &nbytes);
1739
0
    l_binaryWrite(destfile, "a", data, nbytes);
1740
0
    LEPT_FREE(data);
1741
0
    return 0;
1742
0
}
1743
1744
1745
/*!
1746
 * \brief   fileAppendString()
1747
 *
1748
 * \param[in]    filename
1749
 * \param[in]    str       string to append to file
1750
 * \return  0 if OK, 1 on error
1751
 */
1752
l_ok
1753
fileAppendString(const char  *filename,
1754
                 const char  *str)
1755
0
{
1756
0
FILE  *fp;
1757
1758
0
    if (!filename)
1759
0
        return ERROR_INT("filename not defined", __func__, 1);
1760
0
    if (!str)
1761
0
        return ERROR_INT("str not defined", __func__, 1);
1762
1763
0
    if ((fp = fopenWriteStream(filename, "a")) == NULL)
1764
0
        return ERROR_INT_1("stream not opened", filename, __func__, 1);
1765
0
    fprintf(fp, "%s", str);
1766
0
    fclose(fp);
1767
0
    return 0;
1768
0
}
1769
1770
1771
/*--------------------------------------------------------------------*
1772
 *                         File split operations                      *
1773
 *--------------------------------------------------------------------*/
1774
/*!
1775
 * \brief   fileSplitLinesUniform()
1776
 *
1777
 * \param[in]    filename      input file
1778
 * \param[in]    n             number of output files (>= 1)
1779
 * \param[in]    save_empty    1 to save empty lines; 0 to remove them
1780
 * \param[in]    rootpath      root pathname of output files
1781
 * \param[in]    ext           output extension, including the '.'; can be NULL
1782
 * \return  0 if OK, 1 on error
1783
 *
1784
 * <pre>
1785
 * Notes:
1786
 *      (1) This splits an input text file into %n files with roughly
1787
 *          equal numbers of text lines in each file.
1788
 *      (2) if %save_empty == 1, empty lines are included, and concatention
1789
 *          of the text in the split files will be identical to the original.
1790
 *      (3) The output filenames are in the form:
1791
 *               <rootpath>_N.<ext>, N = 1, ... n
1792
 *      (4) This handles the temp directory pathname conversion where needed:
1793
 *              /tmp  ==>  [OS specific temp directory]
1794
 *      (5) Files can also be sharded into sets of lines by the program 'split':
1795
 *              split -n l/<n> <filename>
1796
 *          Using 'split', the resulting files have approximately equal
1797
 *          numbers of bytes, rather than equal numbers of lines.
1798
 * </pre>
1799
 */
1800
l_ok
1801
fileSplitLinesUniform(const char  *filename,
1802
                      l_int32      n,
1803
                      l_int32      save_empty,
1804
                      const char  *rootpath,
1805
                      const char  *ext)
1806
0
{
1807
0
l_int32   i, totlines, nlines, index;
1808
0
size_t    nbytes;
1809
0
l_uint8  *data;
1810
0
char     *str;
1811
0
char      outname[512];
1812
0
NUMA     *na;
1813
0
SARRAY   *sa;
1814
1815
0
    if (!filename)
1816
0
        return ERROR_INT("filename not defined", __func__, 1);
1817
0
    if (!rootpath)
1818
0
        return ERROR_INT("rootpath not defined", __func__, 1);
1819
0
    if (n <= 0)
1820
0
        return ERROR_INT("n must be > 0", __func__, 1);
1821
0
    if (save_empty != 0 && save_empty != 1)
1822
0
        return ERROR_INT("save_empty not 0 or 1", __func__, 1);
1823
1824
        /* Make sarray of lines; the newlines are stripped off */
1825
0
    if ((data = l_binaryRead(filename, &nbytes)) == NULL)
1826
0
        return ERROR_INT("data not read", __func__, 1);
1827
0
    sa = sarrayCreateLinesFromString((const char *)data, save_empty);
1828
0
    LEPT_FREE(data);
1829
0
    if (!sa)
1830
0
        return ERROR_INT("sa not made", __func__, 1);
1831
0
    totlines = sarrayGetCount(sa);
1832
0
    if (n > totlines) {
1833
0
        sarrayDestroy(&sa);
1834
0
        L_ERROR("num files = %d > num lines = %d\n", __func__, n, totlines);
1835
0
        return 1;
1836
0
    }
1837
1838
        /* Write n sets of lines to n files, adding the newlines back */
1839
0
    na = numaGetUniformBinSizes(totlines, n);
1840
0
    index = 0;
1841
0
    for (i = 0; i < n; i++) {
1842
0
        if (ext == NULL)
1843
0
            snprintf(outname, sizeof(outname), "%s_%d", rootpath, i);
1844
0
        else
1845
0
            snprintf(outname, sizeof(outname), "%s_%d%s", rootpath, i, ext);
1846
0
        numaGetIValue(na, i, &nlines);
1847
0
        str = sarrayToStringRange(sa, index, nlines, 1);  /* add newlines */
1848
0
        l_binaryWrite(outname, "w", str, strlen(str));
1849
0
        LEPT_FREE(str);
1850
0
        index += nlines;
1851
0
    }
1852
0
    numaDestroy(&na);
1853
0
    sarrayDestroy(&sa);
1854
0
    return 0;
1855
0
}
1856
1857
1858
/*--------------------------------------------------------------------*
1859
 *          Multi-platform functions for opening file streams         *
1860
 *--------------------------------------------------------------------*/
1861
/*!
 * \brief   fopenReadStream()
 *
 * \param[in]    filename
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          read from a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 *      (3) The stream is always opened in "rb" mode.
 *      (4) If the file cannot be opened at the given path, the directory
 *          part is stripped and the tail is tried relative to the current
 *          working directory.
 *      (5) On Windows only, if that also fails and the name is wrapped in
 *          double quotes, the quotes are removed and the lookup is
 *          repeated.
 * </pre>
 */
FILE *
fopenReadStream(const char  *filename)
{
char   *fname, *tail;
size_t  len;
FILE   *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
    if ((len = strlen(filename)) == 0)
        return (FILE *)ERROR_PTR_1("filename length 0", filename, __func__,
                                   NULL);

        /* Try input filename; never hand a NULL path to fopen() */
    if ((fname = genPathname(filename, NULL)) != NULL) {
        fp = fopen(fname, "rb");
        LEPT_FREE(fname);
        if (fp) return fp;
    }

        /* Else, strip directory and try locally */
    tail = NULL;
    splitPathAtDirectory(filename, NULL, &tail);
    if (tail) {
        fp = fopen(tail, "rb");
        if (!fp)
            L_INFO("failed to open locally with tail %s for filename %s\n",
                 __func__, tail, filename);
        LEPT_FREE(tail);
        if (fp) return fp;
    }

#if defined(_WIN32) || defined(WIN32)
        /* On Windows, if the file wasn't found, check if the name is
           wrapped in double quotes and try again.  This supports
           "Copy as path", which wraps paths in double quotes. */
    if (len > 2 && filename[0] == '"' && filename[len - 1] == '"') {
            /* len bytes, zeroed: room for the len - 2 inner chars + nul */
        char *stripped_name = (char *)LEPT_CALLOC(len, sizeof(char));
        if (!stripped_name) {
            L_ERROR("stripped name not alloc'd\n", __func__);
        } else {
            memcpy(stripped_name, filename + 1, len - 2);
            fp = fopenReadStream(stripped_name);  /* recursive call */
            LEPT_FREE(stripped_name);
            if (fp) return fp;
        }
    }
#endif  /* _WIN32 || WIN32 */

    return (FILE *)ERROR_PTR_1("file not found", filename, __func__, NULL);
}
1924
1925
1926
/*!
 * \brief   fopenWriteStream()
 *
 * \param[in]    filename
 * \param[in]    modestring    passed unchanged to fopen(); e.g., "wb", "a"
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          write or append to a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 * </pre>
 */
FILE *
fopenWriteStream(const char  *filename,
                 const char  *modestring)
{
char  *fname;
FILE  *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
    if (!modestring)
        return (FILE *)ERROR_PTR("modestring not defined", __func__, NULL);

    if ((fname = genPathname(filename, NULL)) == NULL)
        return (FILE *)ERROR_PTR_1("pathname not made", filename, __func__,
                                   NULL);
    fp = fopen(fname, modestring);
    if (!fp)
        fp = (FILE *)ERROR_PTR_1("stream not opened", fname, __func__, NULL);
    LEPT_FREE(fname);
    return fp;
}
1958
1959
1960
/*!
1961
 * \brief   fopenReadFromMemory()
1962
 *
1963
 * \param[in]    data, size
1964
 * \return  file stream, or NULL on error
1965
 *
1966
 * <pre>
1967
 * Notes:
1968
 *      (1) Work-around if fmemopen() not available.
1969
 *      (2) Windows tmpfile() writes into the root C:\ directory, which
1970
 *          requires admin privileges.  This also works around that.
1971
 * </pre>
1972
 */
1973
FILE *
1974
fopenReadFromMemory(const l_uint8  *data,
1975
                    size_t          size)
1976
0
{
1977
0
FILE  *fp;
1978
1979
0
    if (!data)
1980
0
        return (FILE *)ERROR_PTR("data not defined", __func__, NULL);
1981
1982
0
#if HAVE_FMEMOPEN
1983
0
    if ((fp = fmemopen((void *)data, size, "rb")) == NULL)
1984
0
        return (FILE *)ERROR_PTR("stream not opened", __func__, NULL);
1985
#else  /* write to tmp file */
1986
    L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
1987
  #ifdef _WIN32
1988
    if ((fp = fopenWriteWinTempfile()) == NULL)
1989
        return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1990
  #else
1991
    if ((fp = tmpfile()) == NULL)
1992
        return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1993
  #endif  /*  _WIN32 */
1994
    fwrite(data, 1, size, fp);
1995
    rewind(fp);
1996
#endif  /* HAVE_FMEMOPEN */
1997
1998
0
    return fp;
1999
0
}
2000
2001
2002
/*--------------------------------------------------------------------*
2003
 *                Opening a Windows tmpfile for writing               *
2004
 *--------------------------------------------------------------------*/
2005
/*!
 * \brief   fopenWriteWinTempfile()
 *
 * \return  file stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The Windows version of tmpfile() writes into the root
 *          C:\ directory, which requires admin privileges.  This
 *          function provides an alternative implementation.
 *      (2) The file is opened read/write in binary mode with the
 *          _O_TEMPORARY flag and wrapped in a "r+b" stream.
 *      (3) On non-Windows builds this is a stub that always returns NULL.
 * </pre>
 */
FILE *
fopenWriteWinTempfile(void)
{
#ifdef _WIN32
l_int32  handle;
FILE    *fp;
char    *filename;

    if ((filename = l_makeTempFilename()) == NULL) {
        L_ERROR("l_makeTempFilename failed, %s\n", __func__, strerror(errno));
        return NULL;
    }

    handle = _open(filename, _O_CREAT | _O_RDWR | _O_SHORT_LIVED |
                   _O_TEMPORARY | _O_BINARY, _S_IREAD | _S_IWRITE);
    lept_free(filename);
    if (handle == -1) {
        L_ERROR("_open failed, %s\n", __func__, strerror(errno));
        return NULL;
    }

    if ((fp = _fdopen(handle, "r+b")) == NULL) {
        L_ERROR("_fdopen failed, %s\n", __func__, strerror(errno));
        _close(handle);  /* no stream owns the descriptor; don't leak it */
        return NULL;
    }

    return fp;
#else
    return NULL;
#endif  /*  _WIN32 */
}
2048
2049
2050
/*--------------------------------------------------------------------*
2051
 *       Multi-platform functions that avoid C-runtime boundary       *
2052
 *             crossing for applications with Windows DLLs            *
2053
 *--------------------------------------------------------------------*/
2054
/*
2055
 *  Problems arise when pointers to streams and data are passed
2056
 *  between two Windows DLLs that have been generated with different
2057
 *  C runtimes.  To avoid this, leptonica provides wrappers for
2058
 *  several C library calls.
2059
 */
2060
/*!
2061
 * \brief   lept_fopen()
2062
 *
2063
 * \param[in]    filename
2064
 * \param[in]    mode       same as for fopen(); e.g., "rb"
2065
 * \return  stream or NULL on error
2066
 *
2067
 * <pre>
2068
 * Notes:
2069
 *      (1) This must be used by any application that passes
2070
 *          a file handle to a leptonica Windows DLL.
2071
 * </pre>
2072
 */
2073
FILE *
2074
lept_fopen(const char  *filename,
2075
           const char  *mode)
2076
0
{
2077
0
    if (!filename)
2078
0
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
2079
0
    if (!mode)
2080
0
        return (FILE *)ERROR_PTR("mode not defined", __func__, NULL);
2081
2082
0
    if (stringFindSubstr(mode, "r", NULL))
2083
0
        return fopenReadStream(filename);
2084
0
    else
2085
0
        return fopenWriteStream(filename, mode);
2086
0
}
2087
2088
2089
/*!
2090
 * \brief   lept_fclose()
2091
 *
2092
 * \param[in]    fp    file stream
2093
 * \return  0 if OK, 1 on error
2094
 *
2095
 * <pre>
2096
 * Notes:
2097
 *      (1) This should be used by any application that accepts
2098
 *          a file handle generated by a leptonica Windows DLL.
2099
 * </pre>
2100
 */
2101
l_ok
2102
lept_fclose(FILE *fp)
2103
0
{
2104
0
    if (!fp)
2105
0
        return ERROR_INT("stream not defined", __func__, 1);
2106
2107
0
    return fclose(fp);
2108
0
}
2109
2110
2111
/*!
 * \brief   lept_calloc()
 *
 * \param[in]    nmemb    number of members
 * \param[in]    size     of each member
 * \return  void ptr, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Pair this with lept_free() throughout an application that
 *          uses Windows DLLs, so that allocation and release happen on
 *          the same side of the C-runtime boundary.
 *      (2) A request for zero members, or for members of zero size,
 *          returns NULL without calling the allocator.
 * </pre>
 */
void *
lept_calloc(size_t  nmemb,
            size_t  size)
{
        /* Both arguments are unsigned, so "not positive" means zero */
    if (nmemb > 0 && size > 0)
        return LEPT_CALLOC(nmemb, size);
    return NULL;
}
2133
2134
2135
/*!
 * \brief   lept_free()
 *
 * \param[in]    ptr    heap block to release; NULL is accepted and ignored
 *
 * <pre>
 * Notes:
 *      (1) Applications that receive heap data allocated by a leptonica
 *          Windows DLL should release it through this function.
 * </pre>
 */
void
lept_free(void *ptr)
{
    if (ptr)
        LEPT_FREE(ptr);
}
2152
2153
2154
/*--------------------------------------------------------------------*
2155
 *                Multi-platform file system operations               *
2156
 *         [ These only write to /tmp or its subdirectories ]         *
2157
 *--------------------------------------------------------------------*/
2158
/*!
2159
 * \brief   lept_mkdir()
2160
 *
2161
 * \param[in]    subdir    of /tmp or its OS specific equivalent
2162
 * \return  0 on success, non-zero on failure
2163
 *
2164
 * <pre>
2165
 * Notes:
2166
 *      (1) %subdir is a partial path that can consist of one or more
2167
 *          directories.
2168
 *      (2) This makes any subdirectories of /tmp that are required.
2169
 *      (3) The root temp directory is:
2170
 *            /tmp    (unix)  [default]
2171
 *            [Temp]  (Windows)
2172
 * </pre>
2173
 */
2174
l_int32
2175
lept_mkdir(const char  *subdir)
2176
0
{
2177
0
char     *dir, *tmpdir;
2178
0
l_int32   i, n;
2179
0
l_int32   ret = 0;
2180
0
SARRAY   *sa;
2181
#ifdef  _WIN32
2182
l_uint32  attributes;
2183
#endif  /* _WIN32 */
2184
2185
0
    if (!LeptDebugOK) {
2186
0
        L_INFO("making named temp subdirectory %s is disabled\n",
2187
0
               __func__, subdir);
2188
0
        return 0;
2189
0
    }
2190
2191
0
    if (!subdir)
2192
0
        return ERROR_INT("subdir not defined", __func__, 1);
2193
0
    if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2194
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2195
2196
0
    sa = sarrayCreate(0);
2197
0
    sarraySplitString(sa, subdir, "/");
2198
0
    n = sarrayGetCount(sa);
2199
0
    dir = genPathname("/tmp", NULL);
2200
0
    ret = 0;   /* don't check ret values with unix because if a directory
2201
                * exists, mkdir() returns -1  */
2202
       /* Make sure the tmp directory exists */
2203
0
#ifndef _WIN32
2204
0
    mkdir(dir, 0777);
2205
#else
2206
    attributes = GetFileAttributesA(dir);
2207
    if (attributes == INVALID_FILE_ATTRIBUTES)
2208
        ret = (CreateDirectoryA(dir, NULL) ? 0 : 1);
2209
#endif
2210
        /* Make all the subdirectories */
2211
0
    for (i = 0; i < n; i++) {
2212
0
        tmpdir = pathJoin(dir, sarrayGetString(sa, i, L_NOCOPY));
2213
0
#ifndef _WIN32
2214
0
        mkdir(tmpdir, 0777);
2215
#else
2216
        if (CreateDirectoryA(tmpdir, NULL) == 0)
2217
            ret += (GetLastError() != ERROR_ALREADY_EXISTS);
2218
#endif
2219
0
        LEPT_FREE(dir);
2220
0
        dir = tmpdir;
2221
0
    }
2222
0
    LEPT_FREE(dir);
2223
0
    sarrayDestroy(&sa);
2224
0
    if (ret > 0)
2225
0
        L_ERROR("failure to create %d directories\n", __func__, ret);
2226
0
    return ret;
2227
0
}
2228
2229
2230
/*!
2231
 * \brief   lept_rmdir()
2232
 *
2233
 * \param[in]    subdir    of /tmp or its OS specific equivalent
2234
 * \return  0 on success, non-zero on failure
2235
 *
2236
 * <pre>
2237
 * Notes:
2238
 *      (1) %subdir is a partial path that can consist of one or more
2239
 *          directories.
2240
 *      (2) This removes all files from the specified subdirectory of
2241
 *          the root temp directory:
2242
 *            /tmp    (unix)
2243
 *            [Temp]  (Windows)
2244
 *          and then removes the subdirectory.
2245
 *      (3) The combination
2246
 *            lept_rmdir(subdir);
2247
 *            lept_mkdir(subdir);
2248
 *          is guaranteed to give you an empty subdirectory.
2249
 * </pre>
2250
 */
2251
l_int32
2252
lept_rmdir(const char  *subdir)
2253
0
{
2254
0
char    *dir, *fname, *fullname;
2255
0
l_int32  exists, ret, i, nfiles;
2256
0
SARRAY  *sa;
2257
#ifdef _WIN32
2258
char    *newpath;
2259
#else
2260
0
char    *realdir;
2261
0
#endif  /* _WIN32 */
2262
2263
0
    if (!subdir)
2264
0
        return ERROR_INT("subdir not defined", __func__, 1);
2265
0
    if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2266
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2267
2268
        /* Find the temp subdirectory */
2269
0
    dir = pathJoin("/tmp", subdir);
2270
0
    if (!dir)
2271
0
        return ERROR_INT("directory name not made", __func__, 1);
2272
0
    lept_direxists(dir, &exists);
2273
0
    if (!exists) {  /* fail silently */
2274
0
        LEPT_FREE(dir);
2275
0
        return 0;
2276
0
    }
2277
2278
        /* List all the files in that directory */
2279
0
    if ((sa = getFilenamesInDirectory(dir)) == NULL) {
2280
0
        L_ERROR("directory %s does not exist!\n", __func__, dir);
2281
0
        LEPT_FREE(dir);
2282
0
        return 1;
2283
0
    }
2284
0
    nfiles = sarrayGetCount(sa);
2285
2286
0
    for (i = 0; i < nfiles; i++) {
2287
0
        fname = sarrayGetString(sa, i, L_NOCOPY);
2288
0
        fullname = genPathname(dir, fname);
2289
0
        remove(fullname);
2290
0
        LEPT_FREE(fullname);
2291
0
    }
2292
2293
0
#ifndef _WIN32
2294
0
    realdir = genPathname("/tmp", subdir);
2295
0
    ret = rmdir(realdir);
2296
0
    LEPT_FREE(realdir);
2297
#else
2298
    newpath = genPathname(dir, NULL);
2299
    ret = (RemoveDirectoryA(newpath) ? 0 : 1);
2300
    LEPT_FREE(newpath);
2301
#endif  /* !_WIN32 */
2302
2303
0
    sarrayDestroy(&sa);
2304
0
    LEPT_FREE(dir);
2305
0
    return ret;
2306
0
}
2307
2308
2309
/*!
2310
 * \brief   lept_direxists()
2311
 *
2312
 * \param[in]    dir
2313
 * \param[out]   pexists    1 if it exists; 0 otherwise
2314
 * \return  void
2315
 *
2316
 * <pre>
2317
 * Notes:
2318
 *      (1) Always use unix pathname separators.
2319
 *      (2) By calling genPathname(), if the pathname begins with "/tmp"
2320
 *          this does an automatic directory translation for operating
2321
 *          systems that use a different path for /tmp.
2322
 * </pre>
2323
 */
2324
void
2325
lept_direxists(const char  *dir,
2326
               l_int32     *pexists)
2327
0
{
2328
0
char  *realdir;
2329
2330
0
    if (!pexists) return;
2331
0
    *pexists = 0;
2332
0
    if (!dir) return;
2333
0
    if ((realdir = genPathname(dir, NULL)) == NULL)
2334
0
        return;
2335
2336
0
#ifndef _WIN32
2337
0
    {
2338
0
    struct stat s;
2339
0
    l_int32 err = stat(realdir, &s);
2340
0
    if (err != -1 && S_ISDIR(s.st_mode))
2341
0
        *pexists = 1;
2342
0
    }
2343
#else  /* _WIN32 */
2344
    {
2345
    l_uint32  attributes;
2346
    attributes = GetFileAttributesA(realdir);
2347
    if (attributes != INVALID_FILE_ATTRIBUTES &&
2348
        (attributes & FILE_ATTRIBUTE_DIRECTORY))
2349
        *pexists = 1;
2350
    }
2351
#endif  /* _WIN32 */
2352
2353
0
    LEPT_FREE(realdir);
2354
0
}
2355
2356
2357
/*!
2358
 * \brief   lept_rm_match()
2359
 *
2360
 * \param[in]    subdir    [optional] if NULL, the removed files are in /tmp
2361
 * \param[in]    substr    [optional] pattern to match in filename
2362
 * \return  0 on success, non-zero on failure
2363
 *
2364
 * <pre>
2365
 * Notes:
2366
 *      (1) This removes the matched files in /tmp or a subdirectory of /tmp.
2367
 *          Use NULL for %subdir if the files are in /tmp.
2368
 *      (2) If %substr == NULL, this removes all files in the directory.
2369
 *          If %substr == "" (empty), this removes no files.
2370
 *          If both %subdir == NULL and %substr == NULL, this removes
2371
 *          all files in /tmp.
2372
 *      (3) Use unix pathname separators.
2373
 *      (4) By calling genPathname(), if the pathname begins with "/tmp"
2374
 *          this does an automatic directory translation for operating
2375
 *          systems that use a different path for /tmp.
2376
 *      (5) Error conditions:
2377
 *            * returns -1 if the directory is not found
2378
 *            * returns the number of files (> 0) that it was unable to remove.
2379
 * </pre>
2380
 */
2381
l_int32
2382
lept_rm_match(const char  *subdir,
2383
              const char  *substr)
2384
0
{
2385
0
char    *path, *fname;
2386
0
char     tempdir[256];
2387
0
l_int32  i, n, ret;
2388
0
SARRAY  *sa;
2389
2390
0
    makeTempDirname(tempdir, sizeof(tempdir), subdir);
2391
0
    if ((sa = getSortedPathnamesInDirectory(tempdir, substr, 0, 0)) == NULL)
2392
0
        return ERROR_INT("sa not made", __func__, -1);
2393
0
    n = sarrayGetCount(sa);
2394
0
    if (n == 0) {
2395
0
        L_WARNING("no matching files found\n", __func__);
2396
0
        sarrayDestroy(&sa);
2397
0
        return 0;
2398
0
    }
2399
2400
0
    ret = 0;
2401
0
    for (i = 0; i < n; i++) {
2402
0
        fname = sarrayGetString(sa, i, L_NOCOPY);
2403
0
        path = genPathname(fname, NULL);
2404
0
        if (lept_rmfile(path) != 0) {
2405
0
            L_ERROR("failed to remove %s\n", __func__, path);
2406
0
            ret++;
2407
0
        }
2408
0
        LEPT_FREE(path);
2409
0
    }
2410
0
    sarrayDestroy(&sa);
2411
0
    return ret;
2412
0
}
2413
2414
2415
/*!
2416
 * \brief   lept_rm()
2417
 *
2418
 * \param[in]    subdir    [optional] subdir of '/tmp'; can be NULL
2419
 * \param[in]    tail      filename without the directory
2420
 * \return  0 on success, non-zero on failure
2421
 *
2422
 * <pre>
2423
 * Notes:
2424
 *      (1) By calling genPathname(), this does an automatic directory
2425
 *          translation on operating systems which use a different path.
2426
 * </pre>
2427
 */
2428
l_int32
2429
lept_rm(const char  *subdir,
2430
        const char  *tail)
2431
0
{
2432
0
char    *path;
2433
0
char     newtemp[256];
2434
0
l_int32  ret;
2435
2436
0
    if (!tail || strlen(tail) == 0)
2437
0
        return ERROR_INT("tail undefined or empty", __func__, 1);
2438
2439
0
    if (makeTempDirname(newtemp, sizeof(newtemp), subdir))
2440
0
        return ERROR_INT("temp dirname not made", __func__, 1);
2441
0
    path = genPathname(newtemp, tail);
2442
0
    ret = lept_rmfile(path);
2443
0
    LEPT_FREE(path);
2444
0
    return ret;
2445
0
}
2446
2447
2448
/*!
2449
 * \brief
2450
 *
2451
 *  lept_rmfile()
2452
 *
2453
 * \param[in]    filepath     full path to file including the directory
2454
 * \return  0 on success, non-zero on failure
2455
 *
2456
 * <pre>
2457
 * Notes:
2458
 *      (1) This removes the named file.
2459
 *      (2) Use unix pathname separators.
2460
 *      (3) There is no name translation.
2461
 *      (4) Unlike the other lept_* functions in this section, this can remove
2462
 *          any file -- it is not restricted to files that are in /tmp or a
2463
 *          subdirectory of it.
2464
 *      (5) For files in /tmp or a subdirectory of it, this does an automatic
2465
 *          directory translation for operating systems that use a different
2466
 *          path for /tmp.
2467
 * </pre>
2468
 */
2469
l_int32
2470
lept_rmfile(const char  *filepath)
2471
0
{
2472
0
l_int32  ret;
2473
2474
0
    if (!filepath || strlen(filepath) == 0)
2475
0
        return ERROR_INT("filepath undefined or empty", __func__, 1);
2476
2477
0
#ifndef _WIN32
2478
0
    ret = remove(filepath);
2479
#else
2480
        /* Set attributes to allow deletion of read-only files */
2481
    SetFileAttributesA(filepath, FILE_ATTRIBUTE_NORMAL);
2482
    ret = DeleteFileA(filepath) ? 0 : 1;
2483
#endif  /* !_WIN32 */
2484
2485
0
    return ret;
2486
0
}
2487
2488
2489
/*!
2490
 * \brief   lept_mv()
2491
 *
2492
 * \param[in]    srcfile
2493
 * \param[in]    newdir     [optional]; can be NULL
2494
 * \param[in]    newtail    [optional]; can be NULL
2495
 * \param[out]   pnewpath   [optional] of actual path; can be NULL
2496
 * \return  0 on success, non-zero on failure
2497
 *
2498
 * <pre>
2499
 * Notes:
2500
 *      (1) This moves %srcfile to /tmp or to a subdirectory of /tmp.
2501
 *      (2) %srcfile can either be a full path or relative to the
2502
 *          current directory.
2503
 *      (3) %newdir can either specify an existing subdirectory of /tmp
2504
 *          or can be NULL.  In the latter case, the file will be written
2505
 *          into /tmp.
2506
 *      (4) %newtail can either specify a filename tail or, if NULL,
2507
 *          the filename is taken from src-tail, the tail of %srcfile.
2508
 *      (5) For debugging, the computed newpath can be returned.  It must
2509
 *          be freed by the caller.
2510
 *      (6) Reminders:
2511
 *          (a) specify files using unix pathnames
2512
 *          (b) this does an automatic directory translation on operating
2513
 *              systems that use a different path for /tmp.
2514
 *      (7) Examples:
2515
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2516
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2517
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2518
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2519
 * </pre>
2520
 */
2521
l_int32
2522
lept_mv(const char  *srcfile,
2523
        const char  *newdir,
2524
        const char  *newtail,
2525
        char       **pnewpath)
2526
0
{
2527
0
char    *srcpath, *newpath, *dir, *srctail;
2528
0
char     newtemp[256];
2529
0
l_int32  ret;
2530
2531
0
    if (!srcfile)
2532
0
        return ERROR_INT("srcfile not defined", __func__, 1);
2533
2534
        /* Require output pathname to be in /tmp/ or a subdirectory */
2535
0
    if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2536
0
        return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2537
2538
        /* Get canonical src pathname */
2539
0
    splitPathAtDirectory(srcfile, &dir, &srctail);
2540
2541
0
#ifndef _WIN32
2542
0
    srcpath = pathJoin(dir, srctail);
2543
0
    LEPT_FREE(dir);
2544
2545
        /* Generate output pathname */
2546
0
    if (!newtail || newtail[0] == '\0')
2547
0
        newpath = pathJoin(newtemp, srctail);
2548
0
    else
2549
0
        newpath = pathJoin(newtemp, newtail);
2550
0
    LEPT_FREE(srctail);
2551
2552
        /* Overwrite any existing file at 'newpath' */
2553
0
    ret = fileCopy(srcpath, newpath);
2554
0
    if (!ret) {  /* and remove srcfile */
2555
0
        char *realpath = genPathname(srcpath, NULL);
2556
0
        remove(realpath);
2557
0
        LEPT_FREE(realpath);
2558
0
    }
2559
#else
2560
    srcpath = genPathname(dir, srctail);
2561
    LEPT_FREE(dir);
2562
2563
        /* Generate output pathname */
2564
    if (!newtail || newtail[0] == '\0')
2565
        newpath = genPathname(newtemp, srctail);
2566
    else
2567
        newpath = genPathname(newtemp, newtail);
2568
    LEPT_FREE(srctail);
2569
2570
        /* Overwrite any existing file at 'newpath' */
2571
    ret = MoveFileExA(srcpath, newpath,
2572
                     MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) ? 0 : 1;
2573
#endif  /* ! _WIN32 */
2574
2575
0
    LEPT_FREE(srcpath);
2576
0
    if (pnewpath)
2577
0
        *pnewpath = newpath;
2578
0
    else
2579
0
        LEPT_FREE(newpath);
2580
0
    return ret;
2581
0
}
2582
2583
2584
/*!
2585
 * \brief   lept_cp()
2586
 *
2587
 * \param[in]    srcfile
2588
 * \param[in]    newdir    [optional]; can be NULL
2589
 * \param[in]    newtail   [optional]; can be NULL
2590
 * \param[out]   pnewpath  [optional] of actual path; can be NULL
2591
 * \return  0 on success, non-zero on failure
2592
 *
2593
 * <pre>
2594
 * Notes:
2595
 *      (1) This copies %srcfile to /tmp or to a subdirectory of /tmp.
2596
 *      (2) %srcfile can either be a full path or relative to the
2597
 *          current directory.
2598
 *      (3) %newdir can either specify an existing subdirectory of /tmp,
2599
 *          or can be NULL.  In the latter case, the file will be written
2600
 *          into /tmp.
2601
 *      (4) %newtail can either specify a filename tail or, if NULL,
2602
 *          the filename is taken from src-tail, the tail of %srcfile.
2603
 *      (5) For debugging, the computed newpath can be returned.  It must
2604
 *          be freed by the caller.
2605
 *      (6) Reminders:
2606
 *          (a) specify files using unix pathnames
2607
 *          (b) this does an automatic directory translation for operating
2608
 *              systems that use a different path for /tmp
2609
 *      (7) Examples:
2610
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2611
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2612
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2613
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2614
 *
2615
 * </pre>
2616
 */
2617
l_int32
2618
lept_cp(const char  *srcfile,
2619
        const char  *newdir,
2620
        const char  *newtail,
2621
        char       **pnewpath)
2622
0
{
2623
0
char    *srcpath, *newpath, *dir, *srctail;
2624
0
char     newtemp[256];
2625
0
l_int32  ret;
2626
2627
0
    if (!srcfile)
2628
0
        return ERROR_INT("srcfile not defined", __func__, 1);
2629
2630
        /* Require output pathname to be in /tmp or a subdirectory */
2631
0
    if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2632
0
        return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2633
2634
       /* Get canonical src pathname */
2635
0
    splitPathAtDirectory(srcfile, &dir, &srctail);
2636
2637
0
#ifndef _WIN32
2638
0
    srcpath = pathJoin(dir, srctail);
2639
0
    LEPT_FREE(dir);
2640
2641
        /* Generate output pathname */
2642
0
    if (!newtail || newtail[0] == '\0')
2643
0
        newpath = pathJoin(newtemp, srctail);
2644
0
    else
2645
0
        newpath = pathJoin(newtemp, newtail);
2646
0
    LEPT_FREE(srctail);
2647
2648
        /* Overwrite any existing file at 'newpath' */
2649
0
    ret = fileCopy(srcpath, newpath);
2650
#else
2651
    srcpath = genPathname(dir, srctail);
2652
    LEPT_FREE(dir);
2653
2654
        /* Generate output pathname */
2655
    if (!newtail || newtail[0] == '\0')
2656
        newpath = genPathname(newtemp, srctail);
2657
    else
2658
        newpath = genPathname(newtemp, newtail);
2659
    LEPT_FREE(srctail);
2660
2661
        /* Overwrite any existing file at 'newpath' */
2662
    ret = CopyFileA(srcpath, newpath, FALSE) ? 0 : 1;
2663
#endif   /* !_WIN32 */
2664
2665
0
    LEPT_FREE(srcpath);
2666
0
    if (pnewpath)
2667
0
        *pnewpath = newpath;
2668
0
    else
2669
0
        LEPT_FREE(newpath);
2670
0
    return ret;
2671
0
}
2672
2673
2674
/*--------------------------------------------------------------------*
2675
 *          Special debug/test function for calling 'system'          *
2676
 *--------------------------------------------------------------------*/
2677
#if defined(__APPLE__)
2678
  #include "TargetConditionals.h"
2679
#endif  /* __APPLE__ */
2680
2681
/*!
2682
 * \brief   callSystemDebug()
2683
 *
2684
 * \param[in]    cmd      command to be exec'd
2685
 * \return  0 on success
2686
 *
2687
 * <pre>
2688
 * Notes:
2689
 *      (1) The C library 'system' call is only made through this function.
2690
 *          It only works in debug/test mode, where the global variable
2691
 *          LeptDebugOK == TRUE.  This variable is set to FALSE in the
2692
 *          library as distributed, and calling this function will
2693
 *          generate an error message.
2694
 * </pre>
2695
 */
2696
l_int32
2697
callSystemDebug(const char *cmd)
2698
0
{
2699
0
l_int32  ret;
2700
2701
0
    if (!cmd) {
2702
0
        L_ERROR("cmd not defined\n", __func__);
2703
0
        return 1;
2704
0
    }
2705
0
    if (LeptDebugOK == FALSE) {
2706
0
        L_INFO("'system' calls are disabled\n", __func__);
2707
0
        return 1;
2708
0
    }
2709
2710
#if defined(__APPLE__)  /* iOS 11 does not support system() */
2711
2712
  #if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1)  /* Mac OS X */
2713
    ret = system(cmd);
2714
  #elif TARGET_OS_IPHONE || defined(OS_IOS)  /* iOS */
2715
    L_ERROR("iOS 11 does not support system()\n", __func__);
2716
  #endif  /* TARGET_OS_OSX */
2717
2718
#else /* ! __APPLE__ */
2719
2720
0
   ret = system(cmd);
2721
2722
0
#endif /* __APPLE__ */
2723
2724
0
   return ret;
2725
0
}
2726
2727
2728
/*--------------------------------------------------------------------*
2729
 *                     General file name operations                   *
2730
 *--------------------------------------------------------------------*/
2731
/*!
2732
 * \brief   splitPathAtDirectory()
2733
 *
2734
 * \param[in]    pathname  full path; can be a directory
2735
 * \param[out]   pdir      [optional] root directory name of
2736
 *                         input path, including trailing '/'
2737
 * \param[out]   ptail     [optional] path tail, which is either
2738
 *                         the file name within the root directory or
2739
 *                         the last sub-directory in the path
2740
 * \return  0 if OK, 1 on error
2741
 *
2742
 * <pre>
2743
 * Notes:
2744
 *      (1) If you only want the tail, input null for the root directory ptr.
2745
 *      (2) If you only want the root directory name, input null for the
2746
 *          tail ptr.
2747
 *      (3) This function makes decisions based only on the lexical
2748
 *          structure of the input.  Examples:
2749
 *            /usr/tmp/abc.d  -->  dir: /usr/tmp/       tail: abc.d
2750
 *            /usr/tmp/       -->  dir: /usr/tmp/       tail: [empty string]
2751
 *            /usr/tmp        -->  dir: /usr/           tail: tmp
2752
 *            abc.d           -->  dir: [empty string]  tail: abc.d
2753
 *      (4  Consider the first example above: /usr/tmp/abc.d.
2754
 *          Suppose you want the stem of the file, abc, without either
2755
 *          the directory or the extension.  This can be extracted in two steps:
2756
 *              splitPathAtDirectory("usr/tmp/abc.d", NULL, &tail);
2757
 *                   [sets tail: "abc.d"]
2758
 *              splitPathAtExtension(tail, &basename, NULL);
2759
 *                   [sets basename: "abc"]
2760
 *      (5) The input can have either forward (unix) or backward (win)
2761
 *          slash separators.  The output has unix separators.
2762
 *          Note that Win32 pathname functions generally accept both
2763
 *          slash forms, but the Windows command line interpreter
2764
 *          only accepts backward slashes, because forward slashes are
2765
 *          used to demarcate switches (vs. dashes in unix).
2766
 * </pre>
2767
 */
2768
l_ok
2769
splitPathAtDirectory(const char  *pathname,
2770
                     char       **pdir,
2771
                     char       **ptail)
2772
0
{
2773
0
char  *cpathname, *lastslash;
2774
2775
0
    if (!pdir && !ptail)
2776
0
        return ERROR_INT("null input for both strings", __func__, 1);
2777
0
    if (pdir) *pdir = NULL;
2778
0
    if (ptail) *ptail = NULL;
2779
0
    if (!pathname)
2780
0
        return ERROR_INT("pathname not defined", __func__, 1);
2781
2782
0
    cpathname = stringNew(pathname);
2783
0
    convertSepCharsInPath(cpathname, UNIX_PATH_SEPCHAR);
2784
0
    lastslash = strrchr(cpathname, '/');
2785
0
    if (lastslash) {
2786
0
        if (ptail)
2787
0
            *ptail = stringNew(lastslash + 1);
2788
0
        if (pdir) {
2789
0
            *(lastslash + 1) = '\0';
2790
0
            *pdir = cpathname;
2791
0
        } else {
2792
0
            LEPT_FREE(cpathname);
2793
0
        }
2794
0
    } else {  /* no directory */
2795
0
        if (pdir)
2796
0
            *pdir = stringNew("");
2797
0
        if (ptail)
2798
0
            *ptail = cpathname;
2799
0
        else
2800
0
            LEPT_FREE(cpathname);
2801
0
    }
2802
2803
0
    return 0;
2804
0
}
2805
2806
2807
/*!
2808
 * \brief   splitPathAtExtension()
2809
 *
2810
 * \param[in]    pathname    full path; can be a directory
2811
 * \param[out]   pbasename   [optional] pathname not including the
2812
 *                           last dot and characters after that
2813
 * \param[out]   pextension  [optional] path extension, which is
2814
 *                           the last dot and the characters after it.  If
2815
 *                           there is no extension, it returns the empty string
2816
 * \return  0 if OK, 1 on error
2817
 *
2818
 * <pre>
2819
 * Notes:
2820
 *      (1) If you only want the extension, input null for the basename ptr.
2821
 *      (2) If you only want the basename without extension, input null
2822
 *          for the extension ptr.
2823
 *      (3) This function makes decisions based only on the lexical
2824
 *          structure of the input.  Examples:
2825
 *            /usr/tmp/abc.jpg  -->  basename: /usr/tmp/abc    ext: .jpg
2826
 *            /usr/tmp/.jpg     -->  basename: /usr/tmp/       ext: .jpg
2827
 *            /usr/tmp.jpg/     -->  basename: /usr/tmp.jpg/   ext: [empty str]
2828
 *            ./.jpg            -->  basename: ./              ext: .jpg
2829
 *      (4) The input can have either forward (unix) or backward (win)
2830
 *          slash separators.  The output has unix separators.
2831
 *      (5) Note that basename, as used here, is different from the result
2832
 *          of the unix program 'basename'.  Here, basename is the entire
2833
 *          pathname up to a final extension and its preceding dot.
2834
 * </pre>
2835
 */
2836
l_ok
2837
splitPathAtExtension(const char  *pathname,
2838
                     char       **pbasename,
2839
                     char       **pextension)
2840
0
{
2841
0
char  *tail, *dir, *lastdot;
2842
0
char   empty[4] = "";
2843
2844
0
    if (!pbasename && !pextension)
2845
0
        return ERROR_INT("null input for both strings", __func__, 1);
2846
0
    if (pbasename) *pbasename = NULL;
2847
0
    if (pextension) *pextension = NULL;
2848
0
    if (!pathname)
2849
0
        return ERROR_INT("pathname not defined", __func__, 1);
2850
2851
        /* Split out the directory first */
2852
0
    splitPathAtDirectory(pathname, &dir, &tail);
2853
2854
        /* Then look for a "." in the tail part.
2855
         * This way we ignore all "." in the directory. */
2856
0
    if ((lastdot = strrchr(tail, '.'))) {
2857
0
        if (pextension)
2858
0
            *pextension = stringNew(lastdot);
2859
0
        if (pbasename) {
2860
0
            *lastdot = '\0';
2861
0
            *pbasename = stringJoin(dir, tail);
2862
0
        }
2863
0
    } else {
2864
0
        if (pextension)
2865
0
            *pextension = stringNew(empty);
2866
0
        if (pbasename)
2867
0
            *pbasename = stringNew(pathname);
2868
0
    }
2869
0
    LEPT_FREE(dir);
2870
0
    LEPT_FREE(tail);
2871
0
    return 0;
2872
0
}
2873
2874
2875
/*!
2876
 * \brief   pathJoin()
2877
 *
2878
 * \param[in]    dir     [optional] can be null
2879
 * \param[in]    fname   [optional] can be null
2880
 * \return  specially concatenated path, or NULL on error
2881
 *
2882
 * <pre>
2883
 * Notes:
2884
 *      (1) Use unix-style pathname separators ('/').
2885
 *      (2) %fname can be the entire path, or part of the path containing
2886
 *          at least one directory, or a tail without a directory, or NULL.
2887
 *      (3) It produces a path that strips multiple slashes to a single
2888
 *          slash, joins %dir and %fname by a slash, and has no trailing
2889
 *          slashes (except in the cases where %dir == "/" and
2890
 *          %fname == NULL, or v.v.).
2891
 *      (4) If both %dir and %fname are null, produces an empty string.
2892
 *      (5) Neither %dir nor %fname can begin with '..'.
2893
 *      (6) The result is not canonicalized or tested for correctness:
2894
 *          garbage in (e.g., /&%), garbage out.
2895
 *      (7) Examples:
2896
 *             //tmp// + //abc/  -->  /tmp/abc
2897
 *             tmp/ + /abc/      -->  tmp/abc
2898
 *             tmp/ + abc/       -->  tmp/abc
2899
 *             /tmp/ + ///       -->  /tmp
2900
 *             /tmp/ + NULL      -->  /tmp
2901
 *             // + /abc//       -->  /abc
2902
 *             // + NULL         -->  /
2903
 *             NULL + /abc/def/  -->  /abc/def
2904
 *             NULL + abc//      -->  abc
2905
 *             NULL + //         -->  /
2906
 *             NULL + NULL       -->  (empty string)
2907
 *             "" + ""           -->  (empty string)
2908
 *             "" + /            -->  /
2909
 *             ".." + /etc/foo   -->  NULL
2910
 *             /tmp + ".."       -->  NULL
2911
 * </pre>
2912
 */
2913
char *
2914
pathJoin(const char  *dir,
2915
         const char  *fname)
2916
0
{
2917
0
const char *slash = "/";
2918
0
char       *str, *dest;
2919
0
l_int32     i, n1, n2, emptydir;
2920
0
size_t      size;
2921
0
SARRAY     *sa1, *sa2;
2922
0
L_BYTEA    *ba;
2923
2924
0
    if (!dir && !fname)
2925
0
        return stringNew("");
2926
0
    if (dir && strlen(dir) >= 2 && dir[0] == '.' && dir[1] == '.')
2927
0
        return (char *)ERROR_PTR("dir starts with '..'", __func__, NULL);
2928
0
    if (fname && strlen(fname) >= 2 && fname[0] == '.' && fname[1] == '.')
2929
0
        return (char *)ERROR_PTR("fname starts with '..'", __func__, NULL);
2930
2931
0
    sa1 = sarrayCreate(0);
2932
0
    sa2 = sarrayCreate(0);
2933
0
    ba = l_byteaCreate(4);
2934
2935
        /* Process %dir */
2936
0
    if (dir && strlen(dir) > 0) {
2937
0
        if (dir[0] == '/')
2938
0
            l_byteaAppendString(ba, slash);
2939
0
        sarraySplitString(sa1, dir, "/");  /* removes all slashes */
2940
0
        n1 = sarrayGetCount(sa1);
2941
0
        for (i = 0; i < n1; i++) {
2942
0
            str = sarrayGetString(sa1, i, L_NOCOPY);
2943
0
            l_byteaAppendString(ba, str);
2944
0
            l_byteaAppendString(ba, slash);
2945
0
        }
2946
0
    }
2947
2948
        /* Special case to add leading slash: dir NULL or empty string  */
2949
0
    emptydir = dir && strlen(dir) == 0;
2950
0
    if ((!dir || emptydir) && fname && strlen(fname) > 0 && fname[0] == '/')
2951
0
        l_byteaAppendString(ba, slash);
2952
2953
        /* Process %fname */
2954
0
    if (fname && strlen(fname) > 0) {
2955
0
        sarraySplitString(sa2, fname, "/");
2956
0
        n2 = sarrayGetCount(sa2);
2957
0
        for (i = 0; i < n2; i++) {
2958
0
            str = sarrayGetString(sa2, i, L_NOCOPY);
2959
0
            l_byteaAppendString(ba, str);
2960
0
            l_byteaAppendString(ba, slash);
2961
0
        }
2962
0
    }
2963
2964
        /* Remove trailing slash */
2965
0
    dest = (char *)l_byteaCopyData(ba, &size);
2966
0
    if (size > 1 && dest[size - 1] == '/')
2967
0
        dest[size - 1] = '\0';
2968
2969
0
    sarrayDestroy(&sa1);
2970
0
    sarrayDestroy(&sa2);
2971
0
    l_byteaDestroy(&ba);
2972
0
    return dest;
2973
0
}
2974
2975
2976
/*!
 * \brief   appendSubdirs()
 *
 * \param[in]    basedir
 * \param[in]    subdirs
 * \return  concatenated full directory path without trailing slash,
 *              or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Use unix pathname separators
 *      (2) Allocates a new string:  [basedir]/[subdirs]
 *          A single leading '/' on %subdirs is dropped, so that exactly
 *          one separator joins the two parts, and a single trailing '/'
 *          on the result is removed.
 *      (3) An empty %basedir is accepted; the result then starts
 *          with the separator.
 *      (4) The caller owns the returned string.
 * </pre>
 */
char *
appendSubdirs(const char  *basedir,
              const char  *subdirs)
{
char   *newdir;
size_t  len1, len2, len3, len4;

    if (!basedir || !subdirs)
        return (char *)ERROR_PTR("basedir and subdirs not both defined",
                                 __func__, NULL);

    len1 = strlen(basedir);
    len2 = strlen(subdirs);
    len3 = len1 + len2 + 8;  /* room for the separator and terminator */
    if ((newdir = (char *)LEPT_CALLOC(len3, 1)) == NULL)
        return (char *)ERROR_PTR("newdir not made", __func__, NULL);
    stringCat(newdir, len3, basedir);

        /* Add '/' if necessary.  The length test must come first:
         * with an empty basedir there is no last character to look at. */
    if (len1 == 0 || newdir[len1 - 1] != '/')
        newdir[len1] = '/';
    if (subdirs[0] == '/')  /* add subdirs, stripping leading '/' */
        stringCat(newdir, len3, subdirs + 1);
    else
        stringCat(newdir, len3, subdirs);
    len4 = strlen(newdir);
    if (len4 > 0 && newdir[len4 - 1] == '/')  /* strip trailing '/' */
        newdir[len4 - 1] = '\0';

    return newdir;
}
3019
3020
3021
/*--------------------------------------------------------------------*
3022
 *                     Special file name operations                   *
3023
 *--------------------------------------------------------------------*/
3024
/*!
3025
 * \brief   convertSepCharsInPath()
3026
 *
3027
 * \param[in]    path
3028
 * \param[in]    type    UNIX_PATH_SEPCHAR, WIN_PATH_SEPCHAR
3029
 * \return  0 if OK, 1 on error
3030
 *
3031
 * <pre>
3032
 * Notes:
3033
 *      (1) In-place conversion.
3034
 *      (2) Type is the resulting type:
3035
 *            * UNIX_PATH_SEPCHAR:  '\\' ==> '/'
3036
 *            * WIN_PATH_SEPCHAR:   '/' ==> '\\'
3037
 *      (3) Virtually all path operations in leptonica use unix separators.
3038
 *      (4) The backslash is a valid character in unix pathnames and should
3039
 *          not be converted.  Each backslash needs to be escaped with a
3040
 *          preceding backslash for the shell, but the actual filename
3041
 *          does not include these escape characters.
3042
 * </pre>
3043
 */
3044
l_ok
3045
convertSepCharsInPath(char    *path,
3046
                      l_int32  type)
3047
0
{
3048
0
l_int32  i;
3049
0
size_t   len;
3050
3051
0
    if (!path)
3052
0
        return ERROR_INT("path not defined", __func__, 1);
3053
0
    if (type != UNIX_PATH_SEPCHAR && type != WIN_PATH_SEPCHAR)
3054
0
        return ERROR_INT("invalid type", __func__, 1);
3055
3056
0
    len = strlen(path);
3057
0
    if (type == UNIX_PATH_SEPCHAR) {
3058
#ifdef _WIN32  /* only convert on Windows */
3059
        for (i = 0; i < len; i++) {
3060
            if (path[i] == '\\')
3061
                path[i] = '/';
3062
        }
3063
#endif  /* _WIN32 */
3064
0
    } else {  /* WIN_PATH_SEPCHAR */
3065
0
        for (i = 0; i < len; i++) {
3066
0
            if (path[i] == '/')
3067
0
                path[i] = '\\';
3068
0
        }
3069
0
    }
3070
0
    return 0;
3071
0
}
3072
3073
3074
/*!
3075
 * \brief   genPathname()
3076
 *
3077
 * \param[in]    dir     [optional] directory or full path name,
3078
 *                       with or without the trailing '/'
3079
 * \param[in]    fname   [optional] file name within a directory
3080
 * \return  pathname either a directory or full path, or NULL on error
3081
 *
3082
 * <pre>
3083
 * Notes:
3084
 *      (1) This function generates actual paths in the following ways:
3085
 *            * from two sub-parts (e.g., a directory and a file name).
3086
 *            * from a single path full path, placed in %dir, with
3087
 *              %fname == NULL.
3088
 *            * from the name of a file in the local directory placed in
3089
 *              %fname, with %dir == NULL.
3090
 *            * if in a "/tmp" directory and on iOS, macOS or Windows,
3091
 *              the OS specific temp directory is used.
3092
 *      (2) This does an automatic directory translation for operating
3093
 *          systems that use a different path for /tmp.
3094
 *          That path is determined
3095
 *             * on Windows: by GetTempPath()
3096
 *             * on macOS, iOS: by confstr() (see man page)
3097
 *      (3) On unix, the TMPDIR variable is ignored.  No rewriting
3098
 *          of temp directories is permitted.
3099
 *      (4) There are four cases for the input:
3100
 *          (a) %dir is a directory and %fname is defined: result is a
3101
 *              full path
3102
 *          (b) %dir is a directory and %fname is null: result is a directory
3103
 *          (c) %dir is a full path and %fname is null: result is a full path
3104
 *          (d) %dir is null or an empty string: start in the current dir;
3105
 *              result is a full path
3106
 *      (5) In all cases, the resulting pathname is not terminated with a slash
3107
 *      (6) The caller is responsible for freeing the returned pathname.
3108
 * </pre>
3109
 */
3110
char *
3111
genPathname(const char  *dir,
3112
            const char  *fname)
3113
0
{
3114
#if defined(REWRITE_TMP)
3115
l_int32  rewrite_tmp = TRUE;
3116
#else
3117
0
l_int32  rewrite_tmp = FALSE;
3118
0
#endif  /* REWRITE_TMP */
3119
0
char    *cdir, *pathout;
3120
0
l_int32  dirlen, namelen;
3121
0
size_t   size;
3122
3123
0
    if (!dir && !fname)
3124
0
        return (char *)ERROR_PTR("no input", __func__, NULL);
3125
3126
        /* Handle the case where we start from the current directory */
3127
0
    if (!dir || dir[0] == '\0') {
3128
0
        if ((cdir = getcwd(NULL, 0)) == NULL)
3129
0
            return (char *)ERROR_PTR("no current dir found", __func__, NULL);
3130
0
    } else {
3131
0
        if ((cdir = stringNew(dir)) == NULL)
3132
0
            return (char *)ERROR_PTR("stringNew failed", __func__, NULL);
3133
0
    }
3134
3135
        /* Convert to unix path separators, and remove the trailing
3136
         * slash in the directory, except when dir == "/"  */
3137
0
    convertSepCharsInPath(cdir, UNIX_PATH_SEPCHAR);
3138
0
    dirlen = strlen(cdir);
3139
0
    if (cdir[dirlen - 1] == '/' && dirlen != 1) {
3140
0
        cdir[dirlen - 1] = '\0';
3141
0
        dirlen--;
3142
0
    }
3143
3144
0
    namelen = (fname) ? strlen(fname) : 0;
3145
0
    size = dirlen + namelen + 256;
3146
0
    if ((pathout = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL) {
3147
0
        LEPT_FREE(cdir);
3148
0
        return (char *)ERROR_PTR("pathout not made", __func__, NULL);
3149
0
    }
3150
3151
        /* First handle %dir (which may be a full pathname).
3152
         * There is no path rewriting on unix, and on win32, we do not
3153
         * rewrite unless the specified directory is /tmp or
3154
         * a subdirectory of /tmp */
3155
0
    if (!rewrite_tmp || dirlen < 4 ||
3156
0
        (dirlen == 4 && strncmp(cdir, "/tmp", 4) != 0) ||  /* not in "/tmp" */
3157
0
        (dirlen > 4 && strncmp(cdir, "/tmp/", 5) != 0)) {  /* not in "/tmp/" */
3158
0
        stringCopy(pathout, cdir, dirlen);
3159
0
    } else {  /* Rewrite with "/tmp" specified for the directory. */
3160
#if defined(__APPLE__)
3161
        size_t n = confstr(_CS_DARWIN_USER_TEMP_DIR, pathout, size);
3162
        if (n == 0 || n > size) {
3163
            /* Fall back to using /tmp */
3164
            stringCopy(pathout, cdir, dirlen);
3165
        } else {
3166
            /* Add the rest of cdir */
3167
            if (dirlen > 4)
3168
                stringCat(pathout, size, cdir + 4);
3169
        }
3170
#elif defined(_WIN32)
3171
        l_int32 tmpdirlen;
3172
        char tmpdir[MAX_PATH];
3173
        GetTempPathA(sizeof(tmpdir), tmpdir);  /* get the Windows temp dir */
3174
        tmpdirlen = strlen(tmpdir);
3175
        if (tmpdirlen > 0 && tmpdir[tmpdirlen - 1] == '\\') {
3176
            tmpdir[tmpdirlen - 1] = '\0';  /* trim the trailing '\' */
3177
        }
3178
        tmpdirlen = strlen(tmpdir);
3179
        stringCopy(pathout, tmpdir, tmpdirlen);
3180
3181
            /* Add the rest of cdir */
3182
        if (dirlen > 4)
3183
            stringCat(pathout, size, cdir + 4);
3184
#endif  /* _WIN32 */
3185
0
    }
3186
3187
        /* Now handle %fname */
3188
0
    if (fname && strlen(fname) > 0) {
3189
0
        dirlen = strlen(pathout);
3190
0
        pathout[dirlen] = '/';
3191
0
        stringCat(pathout, size, fname);
3192
0
    }
3193
3194
0
    LEPT_FREE(cdir);
3195
0
    return pathout;
3196
0
}
3197
3198
3199
/*!
3200
 * \brief   makeTempDirname()
3201
 *
3202
 * \param[in]    result    preallocated on stack or heap and passed in
3203
 * \param[in]    nbytes    size of %result array, in bytes
3204
 * \param[in]    subdir    [optional]; can be NULL or an empty string
3205
 * \return  0 if OK, 1 on error
3206
 *
3207
 * <pre>
3208
 * Notes:
3209
 *      (1) This generates the directory path for output temp files,
3210
 *          written into %result with unix separators.
3211
 *      (2) Caller allocates %result, large enough to hold the path,
3212
 *          which is:
3213
 *            /tmp/%subdir       (unix)
3214
 *            [Temp]/%subdir     (Windows, macOS, iOS)
3215
 *          where [Temp] is the OS path
3216
 *          and %subdir is in general a set of nested subdirectories:
3217
 *            dir1/dir2/.../dirN
3218
 *          which in use would not typically exceed 2 levels.
3219
 *      (3) Usage example:
3220
 * \code
3221
 *           char  result[256];
3222
 *           makeTempDirname(result, sizeof(result), "lept/golden");
3223
 * \endcode
3224
 * </pre>
3225
 */
3226
l_ok
3227
makeTempDirname(char        *result,
3228
                size_t       nbytes,
3229
                const char  *subdir)
3230
0
{
3231
0
char    *dir, *path;
3232
0
l_int32  ret = 0;
3233
0
size_t   pathlen;
3234
3235
0
    if (!result)
3236
0
        return ERROR_INT("result not defined", __func__, 1);
3237
0
    if (subdir && ((subdir[0] == '.') || (subdir[0] == '/')))
3238
0
        return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
3239
3240
0
    memset(result, 0, nbytes);
3241
3242
0
    dir = pathJoin("/tmp", subdir);
3243
3244
#if defined(REWRITE_TMP)
3245
    path = genPathname(dir, NULL);
3246
#else
3247
0
    path = stringNew(dir);
3248
0
#endif  /*  ~ _WIN32 */
3249
0
    pathlen = strlen(path);
3250
0
    if (pathlen < nbytes - 1) {
3251
0
        stringCopy(result, path, nbytes);
3252
0
    } else {
3253
0
        L_ERROR("result array too small for path\n", __func__);
3254
0
        ret = 1;
3255
0
    }
3256
3257
0
    LEPT_FREE(dir);
3258
0
    LEPT_FREE(path);
3259
0
    return ret;
3260
0
}
3261
3262
3263
/*!
3264
 * \brief   modifyTrailingSlash()
3265
 *
3266
 * \param[in]    path     preallocated on stack or heap and passed in
3267
 * \param[in]    nbytes   size of %path array, in bytes
3268
 * \param[in]    flag     L_ADD_TRAIL_SLASH or L_REMOVE_TRAIL_SLASH
3269
 * \return  0 if OK, 1 on error
3270
 *
3271
 * <pre>
3272
 * Notes:
3273
 *      (1) This carries out the requested action if necessary.
3274
 * </pre>
3275
 */
3276
l_ok
3277
modifyTrailingSlash(char    *path,
3278
                    size_t   nbytes,
3279
                    l_int32  flag)
3280
0
{
3281
0
char    lastchar;
3282
0
size_t  len;
3283
3284
0
    if (!path)
3285
0
        return ERROR_INT("path not defined", __func__, 1);
3286
0
    if (flag != L_ADD_TRAIL_SLASH && flag != L_REMOVE_TRAIL_SLASH)
3287
0
        return ERROR_INT("invalid flag", __func__, 1);
3288
3289
0
    len = strlen(path);
3290
0
    lastchar = path[len - 1];
3291
0
    if (flag == L_ADD_TRAIL_SLASH && lastchar != '/' && len < nbytes - 2) {
3292
0
        path[len] = '/';
3293
0
        path[len + 1] = '\0';
3294
0
    } else if (flag == L_REMOVE_TRAIL_SLASH && lastchar == '/') {
3295
0
        path[len - 1] = '\0';
3296
0
    }
3297
0
    return 0;
3298
0
}
3299
3300
3301
/*!
3302
 * \brief   l_makeTempFilename()
3303
 *
3304
 * \return  fname : heap allocated filename; returns NULL on failure.
3305
 *
3306
 * <pre>
3307
 * Notes:
3308
 *      (1) On unix, this makes a filename of the form
3309
 *               "/tmp/lept.XXXXXX",
3310
 *          where each X is a random character.
3311
 *      (2) On Windows, this makes a filename of the form
3312
 *               "/[Temp]/lp.XXXXXX".
3313
 *      (3) On all systems, this fails if the file is not writable.
3314
 *      (4) Safest usage is to write to a subdirectory in debug code.
3315
 *      (5) The returned filename must be freed by the caller, using lept_free.
3316
 *      (6) The tail of the filename has a '.', so that cygwin interprets
3317
 *          the file as having an extension.  Otherwise, cygwin assumes it
3318
 *          is an executable and appends ".exe" to the filename.
3319
 *      (7) On unix, whenever possible use tmpfile() instead.  tmpfile()
3320
 *          hides the file name, returns a stream opened for write,
3321
 *          and deletes the temp file when the stream is closed.
3322
 * </pre>
3323
 */
3324
char *
3325
l_makeTempFilename(void)
3326
0
{
3327
0
char  dirname[240];
3328
3329
0
    if (makeTempDirname(dirname, sizeof(dirname), NULL) == 1)
3330
0
        return (char *)ERROR_PTR("failed to make dirname", __func__, NULL);
3331
3332
0
#ifndef _WIN32
3333
0
{
3334
0
    char    *pattern;
3335
0
    l_int32  fd;
3336
0
    pattern = stringConcatNew(dirname, "/lept.XXXXXX", NULL);
3337
0
    fd = mkstemp(pattern);
3338
0
    if (fd == -1) {
3339
0
        LEPT_FREE(pattern);
3340
0
        return (char *)ERROR_PTR("mkstemp failed", __func__, NULL);
3341
0
    }
3342
0
    close(fd);
3343
0
    return pattern;
3344
0
}
3345
#else
3346
{
3347
    char  fname[MAX_PATH];
3348
    FILE *fp;
3349
    if (GetTempFileNameA(dirname, "lp.", 0, fname) == 0)
3350
        return (char *)ERROR_PTR("GetTempFileName failed", __func__, NULL);
3351
    if ((fp = fopen(fname, "wb")) == NULL)
3352
        return (char *)ERROR_PTR("file cannot be written to", __func__, NULL);
3353
    fclose(fp);
3354
    return stringNew(fname);
3355
}
3356
#endif  /*  ~ _WIN32 */
3357
0
}
3358
3359
3360
/*!
3361
 * \brief   extractNumberFromFilename()
3362
 *
3363
 * \param[in]    fname
3364
 * \param[in]    numpre    number of characters before the digits to be found
3365
 * \param[in]    numpost   number of characters after the digits to be found
3366
 * \return  num number embedded in the filename; -1 on error or if
3367
 *                   not found
3368
 *
3369
 * <pre>
3370
 * Notes:
3371
 *      (1) The number is to be found in the basename, which is the
3372
 *          filename without either the directory or the last extension.
3373
 *      (2) When a number is found, it is non-negative.  If no number
3374
 *          is found, this returns -1, without an error message.  The
3375
 *          caller needs to check.
3376
 * </pre>
3377
 */
3378
l_int32
3379
extractNumberFromFilename(const char  *fname,
3380
                          l_int32      numpre,
3381
                          l_int32      numpost)
3382
0
{
3383
0
char    *tail, *basename;
3384
0
l_int32  len, nret, num;
3385
3386
0
    if (!fname)
3387
0
        return ERROR_INT("fname not defined", __func__, -1);
3388
3389
0
    splitPathAtDirectory(fname, NULL, &tail);
3390
0
    splitPathAtExtension(tail, &basename, NULL);
3391
0
    LEPT_FREE(tail);
3392
3393
0
    len = strlen(basename);
3394
0
    if (numpre + numpost > len - 1) {
3395
0
        LEPT_FREE(basename);
3396
0
        return ERROR_INT("numpre + numpost too big", __func__, -1);
3397
0
    }
3398
3399
0
    basename[len - numpost] = '\0';
3400
0
    nret = sscanf(basename + numpre, "%d", &num);
3401
0
    LEPT_FREE(basename);
3402
3403
0
    if (nret == 1)
3404
0
        return num;
3405
0
    else
3406
0
        return -1;  /* not found */
3407
0
}