LCOV - coverage report for master 70ed9daf - lib/util/charset/util

LCOV - code coverage report

Current view:	top level - lib/util/charset - util_str.c (source / functions)		Hit	Total	Coverage
Test:	coverage report for master 70ed9daf	Lines:	222	252	88.1 %
Date:	2024-01-11 09:59:51	Functions:	20	20	100.0 %

          Line data    Source code

       1             : /*
       2             :    Unix SMB/CIFS implementation.
       3             :    Samba utility functions
       4             :    Copyright (C) Andrew Tridgell 1992-2001
       5             :    Copyright (C) Simo Sorce 2001
       6             :    Copyright (C) Andrew Bartlett 2011
       7             :    Copyright (C) Jeremy Allison  1992-2007
       8             :    Copyright (C) Martin Pool     2003
       9             :    Copyright (C) James Peach     2006
      10             : 
      11             :    This program is free software; you can redistribute it and/or modify
      12             :    it under the terms of the GNU General Public License as published by
      13             :    the Free Software Foundation; either version 3 of the License, or
      14             :    (at your option) any later version.
      15             : 
      16             :    This program is distributed in the hope that it will be useful,
      17             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             :    GNU General Public License for more details.
      20             : 
      21             :    You should have received a copy of the GNU General Public License
      22             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.
      23             : */
      24             : 
      25             : #include "replace.h"
      26             : #include "system/locale.h"
      27             : #include "charset.h"
      28             : #include "lib/util/fault.h"
      29             : 
      30             : #ifdef strcasecmp
      31             : #undef strcasecmp
      32             : #endif
      33             : #ifdef strncasecmp
      34             : #undef strncasecmp
      35             : #endif
      36             : 
      37             : 
      38             : /**
      39             :  Case insensitive string comparison, handle specified for testing
      40             : **/
      41   341816918 : _PUBLIC_ int strcasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
      42             :                                  const char *s1, const char *s2)
      43             : {
      44   341816918 :         codepoint_t c1=0, c2=0;
      45   341816918 :         codepoint_t u1=0, u2=0;
      46   341816918 :         codepoint_t l1=0, l2=0;
      47     2215535 :         size_t size1, size2;
      48             : 
      49             :         /* handle null ptr comparisons to simplify the use in qsort */
      50   341816918 :         if (s1 == s2) return 0;
      51   341816243 :         if (s1 == NULL) return -1;
      52   341816241 :         if (s2 == NULL) return 1;
      53             : 
      54  1152976984 :         while (*s1 && *s2) {
      55  1135830640 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
      56  1135830640 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
      57             : 
      58  1135830640 :                 if (c1 == INVALID_CODEPOINT ||
      59     3951972 :                     c2 == INVALID_CODEPOINT) {
      60           9 :                         return strcasecmp(s1, s2);
      61             :                 }
      62             : 
      63  1135830631 :                 s1 += size1;
      64  1135830631 :                 s2 += size2;
      65             : 
      66  1135830631 :                 if (c1 == c2) {
      67   809554531 :                         continue;
      68             :                 }
      69             : 
      70   326276100 :                 u1 = toupper_m(c1);
      71   326276100 :                 u2 = toupper_m(c2);
      72   326276100 :                 if (u1 == u2) {
      73     1606214 :                         continue;
      74             :                 }
      75             : 
      76   324669886 :                 l1 = tolower_m(c1);
      77   324669886 :                 l2 = tolower_m(c2);
      78   324669886 :                 if (l1 == l2) {
      79           0 :                         continue;
      80             :                 }
      81             : 
      82   324669886 :                 return l1 - l2;
      83             :         }
      84             : 
      85    17146344 :         return *s1 - *s2;
      86             : }
      87             : 
      88             : /**
      89             :  Case insensitive string comparison
      90             : **/
      91   341816900 : _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
      92             : {
      93   341816900 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
      94   341816900 :         return strcasecmp_m_handle(iconv_handle, s1, s2);
      95             : }
      96             : 
      97             : /**
      98             :  Case insensitive string comparison, length limited, handle specified for
      99             :  testing
     100             : **/
     101     7443423 : _PUBLIC_ int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
     102             :                                   const char *s1, const char *s2, size_t n)
     103             : {
     104     7443423 :         codepoint_t c1=0, c2=0;
     105     7443423 :         codepoint_t u1=0, u2=0;
     106     7443423 :         codepoint_t l1=0, l2=0;
     107        8512 :         size_t size1, size2;
     108             : 
     109             :         /* handle null ptr comparisons to simplify the use in qsort */
     110     7443423 :         if (s1 == s2) return 0;
     111     7443133 :         if (s1 == NULL) return -1;
     112     7443132 :         if (s2 == NULL) return 1;
     113             : 
     114    18663257 :         while (*s1 && *s2 && n) {
     115    17821242 :                 n--;
     116             : 
     117    17821242 :                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
     118    17821242 :                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
     119             : 
     120    17821242 :                 if (c1 == INVALID_CODEPOINT ||
     121       25188 :                     c2 == INVALID_CODEPOINT) {
     122             :                         /*
     123             :                          * n was specified in characters,
     124             :                          * now we must convert it to bytes.
     125             :                          * As bytes are the smallest
     126             :                          * character unit, the following
     127             :                          * increment and strncasecmp is always
     128             :                          * safe.
     129             :                          *
     130             :                          * The source string was already known
     131             :                          * to be n characters long, so we are
     132             :                          * guaranteed to be able to look at the
     133             :                          * (n remaining + size1) bytes from the
     134             :                          * s1 position).
     135             :                          */
     136           1 :                         n += size1;
     137           1 :                         return strncasecmp(s1, s2, n);
     138             :                 }
     139             : 
     140    17821241 :                 s1 += size1;
     141    17821241 :                 s2 += size2;
     142             : 
     143    17821241 :                 if (c1 == c2) {
     144    11199479 :                         continue;
     145             :                 }
     146             : 
     147     6621762 :                 u1 = toupper_m(c1);
     148     6621762 :                 u2 = toupper_m(c2);
     149     6621762 :                 if (u1 == u2) {
     150       20647 :                         continue;
     151             :                 }
     152             : 
     153     6601115 :                 l1 = tolower_m(c1);
     154     6601115 :                 l2 = tolower_m(c2);
     155     6601115 :                 if (l1 == l2) {
     156           0 :                         continue;
     157             :                 }
     158             : 
     159     6601115 :                 return l1 - l2;
     160             :         }
     161             : 
     162      842015 :         if (n == 0) {
     163      833213 :                 return 0;
     164             :         }
     165             : 
     166        6554 :         return *s1 - *s2;
     167             : }
     168             : 
     169             : /**
     170             :  Case insensitive string comparison, length limited
     171             : **/
     172     7443411 : _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
     173             : {
     174     7443411 :         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
     175     7443411 :         return strncasecmp_m_handle(iconv_handle, s1, s2, n);
     176             : }
     177             : 
     178             : /**
     179             :  * Compare 2 strings.
     180             :  *
     181             :  * @note The comparison is case-insensitive.
     182             :  **/
     183       95479 : _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
     184             : {
     185       95479 :         return strcasecmp_m(s1,s2) == 0;
     186             : }
     187             : 
     188             : /**
     189             :  Compare 2 strings (case sensitive).
     190             : **/
     191     3300908 : _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
     192             : {
     193     3300908 :         if (s1 == s2)
     194          40 :                 return true;
     195     3300860 :         if (!s1 || !s2)
     196           0 :                 return false;
     197             : 
     198     3300858 :         return strcmp(s1,s2) == 0;
     199             : }
     200             : 
     201             : /**
     202             :  * Calculate the number of units (8 or 16-bit, depending on the
     203             :  * destination charset) that would be needed to convert the input
     204             :  * string, which is expected to be in src_charset encoding, to the
     205             :  * destination charset (which should be a unicode charset).
     206             :  */
     207    40826699 : _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
     208             :                                     const char *s, charset_t src_charset, charset_t dst_charset)
     209             : {
     210    40826699 :         size_t count = 0;
     211             : 
     212             : #ifdef DEVELOPER
     213    40826699 :         switch (dst_charset) {
     214           0 :         case CH_DOS:
     215             :         case CH_UNIX:
     216           0 :                 smb_panic("cannot call strlen_m_ext() with a variable dest charset (must be UTF16* or UTF8)");
     217    39973969 :         default:
     218    40826699 :                 break;
     219             :         }
     220             : 
     221    40826699 :         switch (src_charset) {
     222           0 :         case CH_UTF16LE:
     223             :         case CH_UTF16BE:
     224           0 :                 smb_panic("cannot call strlen_m_ext() with a UTF16 src charset (must be DOS, UNIX, DISPLAY or UTF8)");
     225    39973969 :         default:
     226    40826699 :                 break;
     227             :         }
     228             : #endif
     229    40826699 :         if (!s) {
     230       66084 :                 return 0;
     231             :         }
     232             : 
     233  1175380712 :         while (*s && !(((uint8_t)*s) & 0x80)) {
     234  1134623865 :                 s++;
     235  1134623865 :                 count++;
     236             :         }
     237             : 
     238    40756847 :         if (!*s) {
     239    39896024 :                 return count;
     240             :         }
     241             : 
     242      575596 :         while (*s) {
     243        3536 :                 size_t c_size;
     244      563707 :                 codepoint_t c = next_codepoint_handle_ext(ic, s, strnlen(s, 5),
     245             :                                                           src_charset, &c_size);
     246      563707 :                 s += c_size;
     247             : 
     248      563707 :                 switch (dst_charset) {
     249      555682 :                 case CH_UTF16LE:
     250             :                 case CH_UTF16BE:
     251             :                 case CH_UTF16MUNGED:
     252      555682 :                         if (c < 0x10000) {
     253             :                                 /* Unicode char fits into 16 bits. */
     254      492815 :                                 count += 1;
     255             :                         } else {
     256             :                                 /* Double-width unicode char - 32 bits. */
     257       62867 :                                 count += 2;
     258             :                         }
     259      553391 :                         break;
     260        8025 :                 case CH_UTF8:
     261             :                         /*
     262             :                          * this only checks ranges, and does not
     263             :                          * check for invalid codepoints
     264             :                          */
     265        8025 :                         if (c < 0x80) {
     266        6116 :                                 count += 1;
     267        1909 :                         } else if (c < 0x800) {
     268         871 :                                 count += 2;
     269        1038 :                         } else if (c < 0x10000) {
     270        1038 :                                 count += 3;
     271             :                         } else {
     272           0 :                                 count += 4;
     273             :                         }
     274        6780 :                         break;
     275           0 :                 default:
     276             :                         /*
     277             :                          * non-unicode encoding:
     278             :                          * assume that each codepoint fits into
     279             :                          * one unit in the destination encoding.
     280             :                          */
     281           0 :                         count += 1;
     282             :                 }
     283             :         }
     284             : 
     285       11861 :         return count;
     286             : }
     287             : 
     288             : /**
     289             :  * Calculate the number of units (8 or 16-bit, depending on the
     290             :  * destination charset) that would be needed to convert the input
     291             :  * string, which is expected to be in src_charset encoding, to the
     292             :  * destination charset (which should be a unicode charset).
     293             :  */
     294    40826687 : _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
     295             : {
     296    40826687 :         struct smb_iconv_handle *ic = get_iconv_handle();
     297    40826687 :         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
     298             : }
     299             : 
     300    25231382 : _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
     301             :                                   const charset_t dst_charset)
     302             : {
     303    25231382 :         if (!s) {
     304       94352 :                 return 0;
     305             :         }
     306    25136674 :         return strlen_m_ext(s, src_charset, dst_charset) + 1;
     307             : }
     308             : 
     309      931654 : _PUBLIC_ size_t strlen_m_ext_term_null(const char *s,
     310             :                                        const charset_t src_charset,
     311             :                                        const charset_t dst_charset)
     312             : {
     313        1952 :         size_t len;
     314      931654 :         if (!s) {
     315         972 :                 return 0;
     316             :         }
     317      930681 :         len = strlen_m_ext(s, src_charset, dst_charset);
     318      930681 :         if (len == 0) {
     319      608514 :                 return 0;
     320             :         }
     321             : 
     322      321938 :         return len+1;
     323             : }
     324             : 
     325             : /**
     326             :  * Calculate the number of 16-bit units that would be needed to convert
     327             :  * the input string, which is expected to be in CH_UNIX encoding, to UTF16.
     328             :  *
     329             :  * This will be the same as the number of bytes in a string for single
     330             :  * byte strings, but will be different for multibyte.
     331             :  */
     332    14759326 : _PUBLIC_ size_t strlen_m(const char *s)
     333             : {
     334    14759326 :         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
     335             : }
     336             : 
     337             : /**
     338             :    Work out the number of multibyte chars in a string, including the NULL
     339             :    terminator.
     340             : **/
     341     2242605 : _PUBLIC_ size_t strlen_m_term(const char *s)
     342             : {
     343     2242605 :         return strlen_m_ext_term(s, CH_UNIX, CH_UTF16LE);
     344             : }
     345             : 
     346             : /*
     347             :  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
     348             :  * if a string is there, include the terminator.
     349             :  */
     350             : 
     351      931654 : _PUBLIC_ size_t strlen_m_term_null(const char *s)
     352             : {
     353      931654 :         return strlen_m_ext_term_null(s, CH_UNIX, CH_UTF16LE);
     354             : }
     355             : 
     356             : /**
     357             :  Strchr and strrchr_m are a bit complex on general multi-byte strings.
     358             : **/
     359   313802388 : _PUBLIC_ char *strchr_m(const char *src, char c)
     360             : {
     361     2043569 :         const char *s;
     362   313802388 :         struct smb_iconv_handle *ic = get_iconv_handle();
     363   313802388 :         if (src == NULL) {
     364           0 :                 return NULL;
     365             :         }
     366             :         /* characters below 0x3F are guaranteed to not appear in
     367             :            non-initial position in multi-byte charsets */
     368   313802388 :         if ((c & 0xC0) == 0) {
     369    92490428 :                 return strchr(src, c);
     370             :         }
     371             : 
     372             :         /* this is quite a common operation, so we want it to be
     373             :            fast. We optimise for the ascii case, knowing that all our
     374             :            supported multi-byte character sets are ascii-compatible
     375             :            (ie. they match for the first 128 chars) */
     376             : 
     377  1539228455 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     378  1317979807 :                 if (*s == c)
     379       63312 :                         return discard_const_p(char, s);
     380             :         }
     381             : 
     382   221248648 :         if (!*s)
     383   220035064 :                 return NULL;
     384             : 
     385             : #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
     386             :         /* With compose characters we must restart from the beginning. JRA. */
     387             :         s = src;
     388             : #endif
     389             : 
     390           4 :         while (*s) {
     391           3 :                 size_t size;
     392           3 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     393           3 :                 if (c2 == c) {
     394           0 :                         return discard_const_p(char, s);
     395             :                 }
     396           3 :                 s += size;
     397             :         }
     398             : 
     399           0 :         return NULL;
     400             : }
     401             : 
     402             : /**
     403             :  * Multibyte-character version of strrchr
     404             :  */
     405     7823170 : _PUBLIC_ char *strrchr_m(const char *s, char c)
     406             : {
     407       38605 :         struct smb_iconv_handle *ic;
     408     7823170 :         char *ret = NULL;
     409             : 
     410     7823170 :         if (s == NULL) {
     411           0 :                 return NULL;
     412             :         }
     413             : 
     414             :         /* characters below 0x3F are guaranteed to not appear in
     415             :            non-initial position in multi-byte charsets */
     416     7823170 :         if ((c & 0xC0) == 0) {
     417     7772027 :                 return strrchr(s, c);
     418             :         }
     419             : 
     420             :         /* this is quite a common operation, so we want it to be
     421             :            fast. We optimise for the ascii case, knowing that all our
     422             :            supported multi-byte character sets are ascii-compatible
     423             :            (ie. they match for the first 128 chars). Also, in Samba
     424             :            we only search for ascii characters in 'c' and that
     425             :            in all mb character sets with a compound character
     426             :            containing c, if 'c' is not a match at position
     427             :            p, then p[-1] > 0x7f. JRA. */
     428             : 
     429             :         {
     430       51143 :                 size_t len = strlen(s);
     431       51143 :                 const char *cp = s;
     432       51143 :                 bool got_mb = false;
     433             : 
     434       51143 :                 if (len == 0)
     435         106 :                         return NULL;
     436       51037 :                 cp += (len - 1);
     437        1694 :                 do {
     438      336004 :                         if (c == *cp) {
     439             :                                 /* Could be a match. Part of a multibyte ? */
     440       33775 :                                 if ((cp > s) &&
     441       31917 :                                         (((unsigned char)cp[-1]) & 0x80)) {
     442             :                                         /* Yep - go slow :-( */
     443           0 :                                         got_mb = true;
     444           0 :                                         break;
     445             :                                 }
     446             :                                 /* No - we have a match ! */
     447       33594 :                                 return discard_const_p(char , cp);
     448             :                         }
     449      302229 :                 } while (cp-- != s);
     450       17227 :                 if (!got_mb)
     451       17227 :                         return NULL;
     452             :         }
     453             : 
     454           0 :         ic = get_iconv_handle();
     455             : 
     456           0 :         while (*s) {
     457           0 :                 size_t size;
     458           0 :                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
     459           0 :                 if (c2 == c) {
     460           0 :                         ret = discard_const_p(char, s);
     461             :                 }
     462           0 :                 s += size;
     463             :         }
     464             : 
     465           0 :         return ret;
     466             : }
     467             : 
     468             : /**
     469             :   return True if any (multi-byte) character is lower case
     470             : */
     471          35 : _PUBLIC_ bool strhaslower_handle(struct smb_iconv_handle *ic,
     472             :                                  const char *string)
     473             : {
     474         963 :         while (*string) {
     475         950 :                 size_t c_size;
     476         950 :                 codepoint_t s;
     477         950 :                 codepoint_t t;
     478             : 
     479         950 :                 s = next_codepoint_handle(ic, string, &c_size);
     480         950 :                 string += c_size;
     481             : 
     482         950 :                 t = toupper_m(s);
     483             : 
     484         950 :                 if (s != t) {
     485          22 :                         return true; /* that means it has lower case chars */
     486             :                 }
     487             :         }
     488             : 
     489           0 :         return false;
     490             : }
     491             : 
     492          17 : _PUBLIC_ bool strhaslower(const char *string)
     493             : {
     494          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     495          17 :         return strhaslower_handle(ic, string);
     496             : }
     497             : 
     498             : /**
     499             :   return True if any (multi-byte) character is upper case
     500             : */
     501          35 : _PUBLIC_ bool strhasupper_handle(struct smb_iconv_handle *ic,
     502             :                                  const char *string)
     503             : {
     504         954 :         while (*string) {
     505         941 :                 size_t c_size;
     506         941 :                 codepoint_t s;
     507         941 :                 codepoint_t t;
     508             : 
     509         941 :                 s = next_codepoint_handle(ic, string, &c_size);
     510         941 :                 string += c_size;
     511             : 
     512         941 :                 t = tolower_m(s);
     513             : 
     514         941 :                 if (s != t) {
     515          22 :                         return true; /* that means it has upper case chars */
     516             :                 }
     517             :         }
     518             : 
     519           0 :         return false;
     520             : }
     521             : 
     522          17 : _PUBLIC_ bool strhasupper(const char *string)
     523             : {
     524          17 :         struct smb_iconv_handle *ic = get_iconv_handle();
     525          17 :         return strhasupper_handle(ic, string);
     526             : }
     527             : 
     528             : /***********************************************************************
     529             :  strstr_m - We convert via ucs2 for now.
     530             : ***********************************************************************/
     531             : 
     532     2376452 : char *strstr_m(const char *src, const char *findstr)
     533             : {
     534     2376452 :         TALLOC_CTX *mem_ctx = NULL;
     535        9793 :         smb_ucs2_t *p;
     536        9793 :         smb_ucs2_t *src_w, *find_w;
     537        9793 :         const char *s;
     538        9793 :         char *s2;
     539     2376452 :         char *retp = NULL;
     540     2376452 :         size_t converted_size, findstr_len = 0;
     541             : 
     542             :         /* for correctness */
     543     2376452 :         if (!findstr[0]) {
     544           0 :                 return discard_const_p(char, src);
     545             :         }
     546             : 
     547             :         /* Samba does single character findstr calls a *lot*. */
     548     2376450 :         if (findstr[1] == '\0')
     549      105717 :                 return strchr_m(src, *findstr);
     550             : 
     551             :         /* We optimise for the ascii case, knowing that all our
     552             :            supported multi-byte character sets are ascii-compatible
     553             :            (ie. they match for the first 128 chars) */
     554             : 
     555    44387878 :         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
     556    42909402 :                 if (*s == *findstr) {
     557     2265159 :                         if (!findstr_len)
     558     1391651 :                                 findstr_len = strlen(findstr);
     559             : 
     560     2265159 :                         if (strncmp(s, findstr, findstr_len) == 0) {
     561      792257 :                                 return discard_const_p(char, s);
     562             :                         }
     563             :                 }
     564             :         }
     565             : 
     566     1478476 :         if (!*s)
     567     1474342 :                 return NULL;
     568             : 
     569             : #if 1 /* def BROKEN_UNICODE_COMPOSE_CHARACTERS */
     570             :         /* 'make check' fails unless we do this */
     571             : 
     572             :         /* With compose characters we must restart from the beginning. JRA. */
     573           9 :         s = src;
     574             : #endif
     575             : 
     576             :         /*
     577             :          * Use get_iconv_handle() just as a non-NULL talloc ctx. In
     578             :          * case we leak memory, this should then be more obvious in
     579             :          * the talloc report.
     580             :          */
     581           9 :         mem_ctx = talloc_new(get_iconv_handle());
     582           9 :         if (mem_ctx == NULL) {
     583           0 :                 return NULL;
     584             :         }
     585             : 
     586           9 :         if (!push_ucs2_talloc(mem_ctx, &src_w, src, &converted_size)) {
     587           0 :                 goto done;
     588             :         }
     589             : 
     590           9 :         if (!push_ucs2_talloc(mem_ctx, &find_w, findstr, &converted_size)) {
     591           3 :                 goto done;
     592             :         }
     593             : 
     594           6 :         p = strstr_w(src_w, find_w);
     595             : 
     596           6 :         if (!p) {
     597           3 :                 goto done;
     598             :         }
     599             : 
     600           3 :         *p = 0;
     601           3 :         if (!pull_ucs2_talloc(mem_ctx, &s2, src_w, &converted_size)) {
     602           0 :                 goto done;
     603             :         }
     604           3 :         retp = discard_const_p(char, (s+strlen(s2)));
     605           9 : done:
     606           9 :         TALLOC_FREE(mem_ctx);
     607           9 :         return retp;
     608             : }

Generated by: LCOV version 1.14