1 /**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2015 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61
#ifndef ALLOW_BROKEN_UNICODE
/*
 * Erik allowing broken UTF-16 surrogate pairs and U+FFFE and U+FFFF by default,
 * open to debate.
 *
 * When this evaluates to nonzero, the UTF-16 <-> UTF-8 converters below encode
 * an unpaired surrogate as an individual three-byte UTF-8 sequence and accept
 * U+FFFE / U+FFFF, instead of rejecting such input with EILSEQ.
 */
#define ALLOW_BROKEN_UNICODE 1
#endif /* !defined(ALLOW_BROKEN_UNICODE) */
67
68 /*
69 * IMPORTANT
70 * =========
71 *
72 * All these routines assume that the Unicode characters are in little endian
73 * encoding inside the strings!!!
74 */
75
/*
 * Nonzero makes ntfs_ucstombs() and ntfs_mbstoucs() use the built-in
 * UTF-16LE <-> UTF-8 converters; zero makes them use the locale-dependent
 * multibyte routines instead.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
77
#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 *
 * It is consulted by ntfs_utf16_to_utf8() and ntfs_utf8_to_utf16(), which pass
 * their strings through ntfs_macosx_normalize_utf8() when it is nonzero.
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */
89
/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * NOTE: the table is currently compiled out (#if 0). It holds one byte for
 * each of the 64 code points 0x00..0x3f.
 * NOTE(review): the meaning of the individual bits in each entry is not
 * described anywhere in this part of the file - confirm before re-enabling.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
109
110 /**
111 * ntfs_names_are_equal - compare two Unicode names for equality
112 * @s1: name to compare to @s2
113 * @s1_len: length in Unicode characters of @s1
114 * @s2: name to compare to @s1
115 * @s2_len: length in Unicode characters of @s2
116 * @ic: ignore case bool
117 * @upcase: upcase table (only if @ic == IGNORE_CASE)
118 * @upcase_size: length in Unicode characters of @upcase (if present)
119 *
120 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
121 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
122 * the @upcase table is used to perform a case insensitive comparison.
123 */
ntfs_names_are_equal(const ntfschar * s1,size_t s1_len,const ntfschar * s2,size_t s2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_size)124 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
125 const ntfschar *s2, size_t s2_len,
126 const IGNORE_CASE_BOOL ic,
127 const ntfschar *upcase, const u32 upcase_size)
128 {
129 if (s1_len != s2_len)
130 return FALSE;
131 if (!s1_len)
132 return TRUE;
133 if (ic == CASE_SENSITIVE)
134 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
135 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
136 TRUE;
137 }
138
139 /*
140 * ntfs_names_full_collate() fully collate two Unicode names
141 *
142 * @name1: first Unicode name to compare
143 * @name1_len: length of first Unicode name to compare
144 * @name2: second Unicode name to compare
145 * @name2_len: length of second Unicode name to compare
146 * @ic: either CASE_SENSITIVE or IGNORE_CASE (see below)
147 * @upcase: upcase table
148 * @upcase_len: upcase table size
149 *
150 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
151 * case, but if the names are equal ignoring case, then they are compared
152 * case-sensitively. As an example, "abc" would collate before "BCD" (since
153 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
154 * "ABC" and "abc" are equal ignoring case and 'A' < 'a'). This matches the
155 * collation order of filenames as indexed in NTFS directories.
156 *
157 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
158 * and are considered to match if and only if they are equal ignoring case.
159 *
160 * Returns:
161 * -1 if the first name collates before the second one,
162 * 0 if the names match, or
163 * 1 if the second name collates before the first one
164 */
ntfs_names_full_collate(const ntfschar * name1,const u32 name1_len,const ntfschar * name2,const u32 name2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_len)165 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
166 const ntfschar *name2, const u32 name2_len,
167 const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
168 const u32 upcase_len)
169 {
170 u32 cnt;
171 u16 c1, c2;
172 u16 u1, u2;
173
174 #ifdef DEBUG
175 if (!name1 || !name2 || !upcase || !upcase_len) {
176 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
177 exit(1);
178 }
179 #endif
180 cnt = min(name1_len, name2_len);
181 if (cnt > 0) {
182 if (ic == CASE_SENSITIVE) {
183 while (--cnt && (*name1 == *name2)) {
184 name1++;
185 name2++;
186 }
187 u1 = c1 = le16_to_cpu(*name1);
188 u2 = c2 = le16_to_cpu(*name2);
189 if (u1 < upcase_len)
190 u1 = le16_to_cpu(upcase[u1]);
191 if (u2 < upcase_len)
192 u2 = le16_to_cpu(upcase[u2]);
193 if ((u1 == u2) && cnt)
194 do {
195 name1++;
196 u1 = le16_to_cpu(*name1);
197 name2++;
198 u2 = le16_to_cpu(*name2);
199 if (u1 < upcase_len)
200 u1 = le16_to_cpu(upcase[u1]);
201 if (u2 < upcase_len)
202 u2 = le16_to_cpu(upcase[u2]);
203 } while ((u1 == u2) && --cnt);
204 if (u1 < u2)
205 return -1;
206 if (u1 > u2)
207 return 1;
208 if (name1_len < name2_len)
209 return -1;
210 if (name1_len > name2_len)
211 return 1;
212 if (c1 < c2)
213 return -1;
214 if (c1 > c2)
215 return 1;
216 } else {
217 do {
218 u1 = le16_to_cpu(*name1);
219 name1++;
220 u2 = le16_to_cpu(*name2);
221 name2++;
222 if (u1 < upcase_len)
223 u1 = le16_to_cpu(upcase[u1]);
224 if (u2 < upcase_len)
225 u2 = le16_to_cpu(upcase[u2]);
226 } while ((u1 == u2) && --cnt);
227 if (u1 < u2)
228 return -1;
229 if (u1 > u2)
230 return 1;
231 if (name1_len < name2_len)
232 return -1;
233 if (name1_len > name2_len)
234 return 1;
235 }
236 } else {
237 if (name1_len < name2_len)
238 return -1;
239 if (name1_len > name2_len)
240 return 1;
241 }
242 return 0;
243 }
244
245 /**
246 * ntfs_ucsncmp - compare two little endian Unicode strings
247 * @s1: first string
248 * @s2: second string
249 * @n: maximum unicode characters to compare
250 *
251 * Compare the first @n characters of the Unicode strings @s1 and @s2,
252 * The strings in little endian format and appropriate le16_to_cpu()
253 * conversion is performed on non-little endian machines.
254 *
255 * The function returns an integer less than, equal to, or greater than zero
256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
257 * to be less than, to match, or be greater than @s2.
258 */
ntfs_ucsncmp(const ntfschar * s1,const ntfschar * s2,size_t n)259 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
260 {
261 u16 c1, c2;
262 size_t i;
263
264 #ifdef DEBUG
265 if (!s1 || !s2) {
266 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
267 exit(1);
268 }
269 #endif
270 for (i = 0; i < n; ++i) {
271 c1 = le16_to_cpu(s1[i]);
272 c2 = le16_to_cpu(s2[i]);
273 if (c1 < c2)
274 return -1;
275 if (c1 > c2)
276 return 1;
277 if (!c1)
278 break;
279 }
280 return 0;
281 }
282
283 /**
284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
285 * @s1: first string
286 * @s2: second string
287 * @n: maximum unicode characters to compare
288 * @upcase: upcase table
289 * @upcase_size: upcase table size in Unicode characters
290 *
291 * Compare the first @n characters of the Unicode strings @s1 and @s2,
292 * ignoring case. The strings in little endian format and appropriate
293 * le16_to_cpu() conversion is performed on non-little endian machines.
294 *
295 * Each character is uppercased using the @upcase table before the comparison.
296 *
297 * The function returns an integer less than, equal to, or greater than zero
298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
299 * to be less than, to match, or be greater than @s2.
300 */
ntfs_ucsncasecmp(const ntfschar * s1,const ntfschar * s2,size_t n,const ntfschar * upcase,const u32 upcase_size)301 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
302 const ntfschar *upcase, const u32 upcase_size)
303 {
304 u16 c1, c2;
305 size_t i;
306
307 #ifdef DEBUG
308 if (!s1 || !s2 || !upcase) {
309 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
310 exit(1);
311 }
312 #endif
313 for (i = 0; i < n; ++i) {
314 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
315 c1 = le16_to_cpu(upcase[c1]);
316 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
317 c2 = le16_to_cpu(upcase[c2]);
318 if (c1 < c2)
319 return -1;
320 if (c1 > c2)
321 return 1;
322 if (!c1)
323 break;
324 }
325 return 0;
326 }
327
328 /**
329 * ntfs_ucsnlen - determine the length of a little endian Unicode string
330 * @s: pointer to Unicode string
331 * @maxlen: maximum length of string @s
332 *
333 * Return the number of Unicode characters in the little endian Unicode
334 * string @s up to a maximum of maxlen Unicode characters, not including
335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
336 * and @s + @maxlen, @maxlen is returned.
337 *
338 * This function never looks beyond @s + @maxlen.
339 */
ntfs_ucsnlen(const ntfschar * s,u32 maxlen)340 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
341 {
342 u32 i;
343
344 for (i = 0; i < maxlen; i++) {
345 if (!le16_to_cpu(s[i]))
346 break;
347 }
348 return i;
349 }
350
351 /**
352 * ntfs_ucsndup - duplicate little endian Unicode string
353 * @s: pointer to Unicode string
354 * @maxlen: maximum length of string @s
355 *
356 * Return a pointer to a new little endian Unicode string which is a duplicate
357 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
358 * and can be freed with free(3).
359 *
360 * A maximum of @maxlen Unicode characters are copied and a terminating
361 * (ntfschar)'\0' little endian Unicode character is added.
362 *
363 * This function never looks beyond @s + @maxlen.
364 *
365 * Return a pointer to the new little endian Unicode string on success and NULL
366 * on failure with errno set to the error code.
367 */
ntfs_ucsndup(const ntfschar * s,u32 maxlen)368 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
369 {
370 ntfschar *dst;
371 u32 len;
372
373 len = ntfs_ucsnlen(s, maxlen);
374 dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
375 if (dst) {
376 memcpy(dst, s, len * sizeof(ntfschar));
377 dst[len] = const_cpu_to_le16(L'\0');
378 }
379 return dst;
380 }
381
382 /**
383 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
384 * @name:
385 * @name_len:
386 * @upcase:
387 * @upcase_len:
388 *
389 * Description...
390 *
391 * Returns:
392 */
ntfs_name_upcase(ntfschar * name,u32 name_len,const ntfschar * upcase,const u32 upcase_len)393 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
394 const u32 upcase_len)
395 {
396 u32 i;
397 u16 u;
398
399 for (i = 0; i < name_len; i++)
400 if ((u = le16_to_cpu(name[i])) < upcase_len)
401 name[i] = upcase[u];
402 }
403
404 /**
405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
406 */
ntfs_name_locase(ntfschar * name,u32 name_len,const ntfschar * locase,const u32 locase_len)407 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
408 const u32 locase_len)
409 {
410 u32 i;
411 u16 u;
412
413 if (locase)
414 for (i = 0; i < name_len; i++)
415 if ((u = le16_to_cpu(name[i])) < locase_len)
416 name[i] = locase[u];
417 }
418
419 /**
420 * ntfs_file_value_upcase - Convert a filename to upper case
421 * @file_name_attr:
422 * @upcase:
423 * @upcase_len:
424 *
425 * Description...
426 *
427 * Returns:
428 */
ntfs_file_value_upcase(FILE_NAME_ATTR * file_name_attr,const ntfschar * upcase,const u32 upcase_len)429 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
430 const ntfschar *upcase, const u32 upcase_len)
431 {
432 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
433 file_name_attr->file_name_length, upcase, upcase_len);
434 }
435
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets, but it does not convert anything if
   there was some error with the locale setup or if none was set up, like
   when mount is called during early boot where, by policy, locales are
   not used (and may not be available if /usr is not yet mounted),
   so this patch fixes the resulting issues for systems which use
   UTF-8, and for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
459
460 /*
461 * Return the number of bytes in UTF-8 needed (without the terminating null) to
462 * store the given UTF-16LE string.
463 *
464 * On error, -1 is returned, and errno is set to the error code. The following
465 * error codes can be expected:
466 * EILSEQ The input string is not valid UTF-16LE (only possible
467 * if compiled without ALLOW_BROKEN_UNICODE).
468 * ENAMETOOLONG The length of the UTF-8 string in bytes (without the
469 * terminating null) would exceed @outs_len.
470 */
utf16_to_utf8_size(const ntfschar * ins,const int ins_len,int outs_len)471 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
472 {
473 int i, ret = -1;
474 int count = 0;
475 BOOL surrog;
476
477 surrog = FALSE;
478 for (i = 0; i < ins_len && ins[i] && count <= outs_len; i++) {
479 unsigned short c = le16_to_cpu(ins[i]);
480 if (surrog) {
481 if ((c >= 0xdc00) && (c < 0xe000)) {
482 surrog = FALSE;
483 count += 4;
484 } else {
485 #if ALLOW_BROKEN_UNICODE
486 /* The first UTF-16 unit of a surrogate pair has
487 * a value between 0xd800 and 0xdc00. It can be
488 * encoded as an individual UTF-8 sequence if we
489 * cannot combine it with the next UTF-16 unit
490 * unit as a surrogate pair. */
491 surrog = FALSE;
492 count += 3;
493
494 --i;
495 continue;
496 #else
497 goto fail;
498 #endif /* ALLOW_BROKEN_UNICODE */
499 }
500 } else
501 if (c < 0x80)
502 count++;
503 else if (c < 0x800)
504 count += 2;
505 else if (c < 0xd800)
506 count += 3;
507 else if (c < 0xdc00)
508 surrog = TRUE;
509 #if ALLOW_BROKEN_UNICODE
510 else if (c < 0xe000)
511 count += 3;
512 else if (c >= 0xe000)
513 #else
514 else if ((c >= 0xe000) && (c < 0xfffe))
515 #endif /* ALLOW_BROKEN_UNICODE */
516 count += 3;
517 else
518 goto fail;
519 }
520
521 if (surrog && count <= outs_len) {
522 #if ALLOW_BROKEN_UNICODE
523 count += 3; /* ending with a single surrogate */
524 #else
525 goto fail;
526 #endif /* ALLOW_BROKEN_UNICODE */
527 }
528
529 if (count > outs_len) {
530 errno = ENAMETOOLONG;
531 goto out;
532 }
533
534 ret = count;
535 out:
536 return ret;
537 fail:
538 errno = EILSEQ;
539 goto out;
540 }
541
542 /*
543 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
544 * @ins: input utf16 string buffer
545 * @ins_len: length of input string in utf16 characters
546 * @outs: on return contains the (allocated) output multibyte string
547 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
548 *
549 * Return -1 with errno set if string has invalid byte sequence or too long.
550 */
ntfs_utf16_to_utf8(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)551 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
552 char **outs, int outs_len)
553 {
554 #if defined(__APPLE__) || defined(__DARWIN__)
555 #ifdef ENABLE_NFCONV
556 char *original_outs_value = *outs;
557 int original_outs_len = outs_len;
558 #endif /* ENABLE_NFCONV */
559 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
560
561 char *t;
562 int i, size, ret = -1;
563 int halfpair;
564
565 halfpair = 0;
566 if (!*outs) {
567 /* If no output buffer was provided, we will allocate one and
568 * limit its length to PATH_MAX. Note: we follow the standard
569 * convention of PATH_MAX including the terminating null. */
570 outs_len = PATH_MAX;
571 }
572
573 /* The size *with* the terminating null is limited to @outs_len,
574 * so the size *without* the terminating null is limited to one less. */
575 size = utf16_to_utf8_size(ins, ins_len, outs_len - 1);
576
577 if (size < 0)
578 goto out;
579
580 if (!*outs) {
581 outs_len = size + 1;
582 *outs = ntfs_malloc(outs_len);
583 if (!*outs)
584 goto out;
585 }
586
587 t = *outs;
588
589 for (i = 0; i < ins_len && ins[i]; i++) {
590 unsigned short c = le16_to_cpu(ins[i]);
591 /* size not double-checked */
592 if (halfpair) {
593 if ((c >= 0xdc00) && (c < 0xe000)) {
594 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
595 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
596 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
597 *t++ = 0x80 + (c & 63);
598 halfpair = 0;
599 } else {
600 #if ALLOW_BROKEN_UNICODE
601 /* The first UTF-16 unit of a surrogate pair has
602 * a value between 0xd800 and 0xdc00. It can be
603 * encoded as an individual UTF-8 sequence if we
604 * cannot combine it with the next UTF-16 unit
605 * unit as a surrogate pair. */
606 *t++ = 0xe0 | (halfpair >> 12);
607 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
608 *t++ = 0x80 | (halfpair & 0x3f);
609 halfpair = 0;
610
611 --i;
612 continue;
613 #else
614 goto fail;
615 #endif /* ALLOW_BROKEN_UNICODE */
616 }
617 } else if (c < 0x80) {
618 *t++ = c;
619 } else {
620 if (c < 0x800) {
621 *t++ = (0xc0 | ((c >> 6) & 0x3f));
622 *t++ = 0x80 | (c & 0x3f);
623 } else if (c < 0xd800) {
624 *t++ = 0xe0 | (c >> 12);
625 *t++ = 0x80 | ((c >> 6) & 0x3f);
626 *t++ = 0x80 | (c & 0x3f);
627 } else if (c < 0xdc00)
628 halfpair = c;
629 #if ALLOW_BROKEN_UNICODE
630 else if (c < 0xe000) {
631 *t++ = 0xe0 | (c >> 12);
632 *t++ = 0x80 | ((c >> 6) & 0x3f);
633 *t++ = 0x80 | (c & 0x3f);
634 }
635 #endif /* ALLOW_BROKEN_UNICODE */
636 else if (c >= 0xe000) {
637 *t++ = 0xe0 | (c >> 12);
638 *t++ = 0x80 | ((c >> 6) & 0x3f);
639 *t++ = 0x80 | (c & 0x3f);
640 } else
641 goto fail;
642 }
643 }
644 #if ALLOW_BROKEN_UNICODE
645 if (halfpair) { /* ending with a single surrogate */
646 *t++ = 0xe0 | (halfpair >> 12);
647 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
648 *t++ = 0x80 | (halfpair & 0x3f);
649 }
650 #endif /* ALLOW_BROKEN_UNICODE */
651 *t = '\0';
652
653 #if defined(__APPLE__) || defined(__DARWIN__)
654 #ifdef ENABLE_NFCONV
655 if(nfconvert_utf8 && (t - *outs) > 0) {
656 char *new_outs = NULL;
657 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
658 if(new_outs_len >= 0 && new_outs != NULL) {
659 if(original_outs_value != *outs) {
660 // We have allocated outs ourselves.
661 free(*outs);
662 *outs = new_outs;
663 t = *outs + new_outs_len;
664 }
665 else {
666 // We need to copy new_outs into the fixed outs buffer.
667 memset(*outs, 0, original_outs_len);
668 strncpy(*outs, new_outs, original_outs_len-1);
669 t = *outs + original_outs_len;
670 free(new_outs);
671 }
672 }
673 else {
674 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
675 ntfs_log_error(" new_outs=0x%p\n", new_outs);
676 ntfs_log_error(" new_outs_len=%d\n", new_outs_len);
677 }
678 }
679 #endif /* ENABLE_NFCONV */
680 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
681
682 ret = t - *outs;
683 out:
684 return ret;
685 fail:
686 errno = EILSEQ;
687 goto out;
688 }
689
/*
 * Count the 16-bit elements needed (terminating null excluded) to store the
 * UTF-8 string @s as UTF-16LE: one element per sequence, two for sequences
 * introduced by a lead byte of 0xF0 or above.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX, or
 * to EILSEQ if a lead byte of 0xF5 or above is met.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (lead = *p++; lead; lead = *p++) {
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		/* ASCII and stray continuation bytes stand for themselves. */
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/*
		 * Step over the continuation bytes announced by the lead
		 * byte, taking care never to run past the terminating null.
		 */
		if (!*p)
			break;
		p++;			/* every lead byte has one */
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;		/* three and four byte forms */
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;		/* four byte form */
			/* Such a sequence needs a surrogate pair. */
			if (++units >= PATH_MAX) {
				errno = ENAMETOOLONG;
				return -1;
			}
		}
	}
	return units;
}
737 /*
738 * This converts one UTF-8 sequence to cpu-endian Unicode value
739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
740 *
741 * Return the number of used utf8 bytes or -1 with errno set
742 * if sequence is invalid.
743 */
utf8_to_unicode(u32 * wc,const char * s)744 static int utf8_to_unicode(u32 *wc, const char *s)
745 {
746 unsigned int byte = *((const unsigned char *)s);
747
748 /* single byte */
749 if (byte == 0) {
750 *wc = (u32) 0;
751 return 0;
752 } else if (byte < 0x80) {
753 *wc = (u32) byte;
754 return 1;
755 /* double byte */
756 } else if (byte < 0xc2) {
757 goto fail;
758 } else if (byte < 0xE0) {
759 if ((s[1] & 0xC0) == 0x80) {
760 *wc = ((u32)(byte & 0x1F) << 6)
761 | ((u32)(s[1] & 0x3F));
762 return 2;
763 } else
764 goto fail;
765 /* three-byte */
766 } else if (byte < 0xF0) {
767 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
768 *wc = ((u32)(byte & 0x0F) << 12)
769 | ((u32)(s[1] & 0x3F) << 6)
770 | ((u32)(s[2] & 0x3F));
771 /* Check valid ranges */
772 #if ALLOW_BROKEN_UNICODE
773 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
774 || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
775 || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
776 return 3;
777 #else
778 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
779 || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
780 return 3;
781 #endif /* ALLOW_BROKEN_UNICODE */
782 }
783 goto fail;
784 /* four-byte */
785 } else if (byte < 0xF5) {
786 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
787 && ((s[3] & 0xC0) == 0x80)) {
788 *wc = ((u32)(byte & 0x07) << 18)
789 | ((u32)(s[1] & 0x3F) << 12)
790 | ((u32)(s[2] & 0x3F) << 6)
791 | ((u32)(s[3] & 0x3F));
792 /* Check valid ranges */
793 if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
794 return 4;
795 }
796 goto fail;
797 }
798 fail:
799 errno = EILSEQ;
800 return -1;
801 }
802
803 /**
804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
805 * @ins: input multibyte string buffer
806 * @outs: on return contains the (allocated) output utf16 string
807 * @outs_len: length of output buffer in utf16 characters
808 *
809 * Return -1 with errno set.
810 */
ntfs_utf8_to_utf16(const char * ins,ntfschar ** outs)811 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
812 {
813 #if defined(__APPLE__) || defined(__DARWIN__)
814 #ifdef ENABLE_NFCONV
815 char *new_ins = NULL;
816 if(nfconvert_utf8) {
817 int new_ins_len;
818 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
819 if(new_ins_len >= 0)
820 ins = new_ins;
821 else
822 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
823 }
824 #endif /* ENABLE_NFCONV */
825 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
826 const char *t = ins;
827 u32 wc;
828 BOOL allocated;
829 ntfschar *outpos;
830 int shorts, ret = -1;
831
832 shorts = utf8_to_utf16_size(ins);
833 if (shorts < 0)
834 goto fail;
835
836 allocated = FALSE;
837 if (!*outs) {
838 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
839 if (!*outs)
840 goto fail;
841 allocated = TRUE;
842 }
843
844 outpos = *outs;
845
846 while(1) {
847 int m = utf8_to_unicode(&wc, t);
848 if (m <= 0) {
849 if (m < 0) {
850 /* do not leave space allocated if failed */
851 if (allocated) {
852 free(*outs);
853 *outs = (ntfschar*)NULL;
854 }
855 goto fail;
856 }
857 *outpos++ = const_cpu_to_le16(0);
858 break;
859 }
860 if (wc < 0x10000)
861 *outpos++ = cpu_to_le16(wc);
862 else {
863 wc -= 0x10000;
864 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
865 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
866 }
867 t += m;
868 }
869
870 ret = --outpos - *outs;
871 fail:
872 #if defined(__APPLE__) || defined(__DARWIN__)
873 #ifdef ENABLE_NFCONV
874 if(new_ins != NULL)
875 free(new_ins);
876 #endif /* ENABLE_NFCONV */
877 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
878 return ret;
879 }
880
881 /**
882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
883 * @ins: input Unicode string buffer
884 * @ins_len: length of input string in Unicode characters
885 * @outs: on return contains the (allocated) output multibyte string
886 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
887 *
888 * Convert the input little endian, 2-byte Unicode string @ins, of length
889 * @ins_len into the multibyte string format dictated by the current locale.
890 *
891 * If *@outs is NULL, the function allocates the string and the caller is
892 * responsible for calling free(*@outs); when finished with it.
893 *
894 * On success the function returns the number of bytes written to the output
895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
896 * string buffer was allocated, *@outs is set to it.
897 *
898 * On error, -1 is returned, and errno is set to the error code. The following
899 * error codes can be expected:
900 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
901 * EILSEQ The input string cannot be represented as a multibyte
902 * sequence according to the current locale.
903 * ENAMETOOLONG Destination buffer is too small for input string.
904 * ENOMEM Not enough memory to allocate destination buffer.
905 */
ntfs_ucstombs(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)906 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
907 int outs_len)
908 {
909 char *mbs;
910 int mbs_len;
911 #ifdef MB_CUR_MAX
912 wchar_t wc;
913 int i, o;
914 int cnt = 0;
915 #ifdef HAVE_MBSINIT
916 mbstate_t mbstate;
917 #endif
918 #endif /* MB_CUR_MAX */
919
920 if (!ins || !outs) {
921 errno = EINVAL;
922 return -1;
923 }
924 mbs = *outs;
925 mbs_len = outs_len;
926 if (mbs && !mbs_len) {
927 errno = ENAMETOOLONG;
928 return -1;
929 }
930 if (use_utf8)
931 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
932 #ifdef MB_CUR_MAX
933 if (!mbs) {
934 mbs_len = (ins_len + 1) * MB_CUR_MAX;
935 mbs = ntfs_malloc(mbs_len);
936 if (!mbs)
937 return -1;
938 }
939 #ifdef HAVE_MBSINIT
940 memset(&mbstate, 0, sizeof(mbstate));
941 #else
942 wctomb(NULL, 0);
943 #endif
944 for (i = o = 0; i < ins_len; i++) {
945 /* Reallocate memory if necessary or abort. */
946 if ((int)(o + MB_CUR_MAX) > mbs_len) {
947 char *tc;
948 if (mbs == *outs) {
949 errno = ENAMETOOLONG;
950 return -1;
951 }
952 tc = ntfs_malloc((mbs_len + 64) & ~63);
953 if (!tc)
954 goto err_out;
955 memcpy(tc, mbs, mbs_len);
956 mbs_len = (mbs_len + 64) & ~63;
957 free(mbs);
958 mbs = tc;
959 }
960 /* Convert the LE Unicode character to a CPU wide character. */
961 wc = (wchar_t)le16_to_cpu(ins[i]);
962 if (!wc)
963 break;
964 /* Convert the CPU endian wide character to multibyte. */
965 #ifdef HAVE_MBSINIT
966 cnt = wcrtomb(mbs + o, wc, &mbstate);
967 #else
968 cnt = wctomb(mbs + o, wc);
969 #endif
970 if (cnt == -1)
971 goto err_out;
972 if (cnt <= 0) {
973 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
974 errno = EINVAL;
975 goto err_out;
976 }
977 o += cnt;
978 }
979 #ifdef HAVE_MBSINIT
980 /* Make sure we are back in the initial state. */
981 if (!mbsinit(&mbstate)) {
982 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
983 errno = EILSEQ;
984 goto err_out;
985 }
986 #endif
987 /* Now write the NULL character. */
988 mbs[o] = '\0';
989 if (*outs != mbs)
990 *outs = mbs;
991 return o;
992 err_out:
993 if (mbs != *outs) {
994 int eo = errno;
995 free(mbs);
996 errno = eo;
997 }
998 #else /* MB_CUR_MAX */
999 errno = EILSEQ;
1000 #endif /* MB_CUR_MAX */
1001 return -1;
1002 }
1003
1004 /**
1005 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
1006 * @ins: input multibyte string buffer
1007 * @outs: on return contains the (allocated) output Unicode string
1008 *
1009 * Convert the input multibyte string @ins, from the current locale into the
1010 * corresponding little endian, 2-byte Unicode string.
1011 *
1012 * The function allocates the string and the caller is responsible for calling
1013 * free(*@outs); when finished with it.
1014 *
1015 * On success the function returns the number of Unicode characters written to
1016 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
1017 * character.
1018 *
1019 * On error, -1 is returned, and errno is set to the error code. The following
1020 * error codes can be expected:
1021 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
1022 * EILSEQ The input string cannot be represented as a Unicode
1023 * string according to the current locale.
1024 * ENAMETOOLONG Destination buffer is too small for input string.
1025 * ENOMEM Not enough memory to allocate destination buffer.
1026 */
ntfs_mbstoucs(const char * ins,ntfschar ** outs)1027 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
1028 {
1029 #ifdef MB_CUR_MAX
1030 ntfschar *ucs;
1031 const char *s;
1032 wchar_t wc;
1033 int i, o, cnt, ins_len, ucs_len, ins_size;
1034 #ifdef HAVE_MBSINIT
1035 mbstate_t mbstate;
1036 #endif
1037 #endif /* MB_CUR_MAX */
1038
1039 if (!ins || !outs) {
1040 errno = EINVAL;
1041 return -1;
1042 }
1043
1044 if (use_utf8)
1045 return ntfs_utf8_to_utf16(ins, outs);
1046
1047 #ifdef MB_CUR_MAX
1048 /* Determine the size of the multi-byte string in bytes. */
1049 ins_size = strlen(ins);
1050 /* Determine the length of the multi-byte string. */
1051 s = ins;
1052 #if defined(HAVE_MBSINIT)
1053 memset(&mbstate, 0, sizeof(mbstate));
1054 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
1055 #ifdef __CYGWIN32__
1056 if (!ins_len && *ins) {
1057 /* Older Cygwin had broken mbsrtowcs() implementation. */
1058 ins_len = strlen(ins);
1059 }
1060 #endif
1061 #elif !defined(DJGPP)
1062 ins_len = mbstowcs(NULL, s, 0);
1063 #else
1064 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
1065 ins_len = strlen(ins);
1066 #endif
1067 if (ins_len == -1)
1068 return ins_len;
1069 #ifdef HAVE_MBSINIT
1070 if ((s != ins) || !mbsinit(&mbstate)) {
1071 #else
1072 if (s != ins) {
1073 #endif
1074 errno = EILSEQ;
1075 return -1;
1076 }
1077 /* Add the NULL terminator. */
1078 ins_len++;
1079 ucs_len = ins_len;
1080 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1081 if (!ucs)
1082 return -1;
1083 #ifdef HAVE_MBSINIT
1084 memset(&mbstate, 0, sizeof(mbstate));
1085 #else
1086 mbtowc(NULL, NULL, 0);
1087 #endif
1088 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1089 /* Reallocate memory if necessary. */
1090 if (o >= ucs_len) {
1091 ntfschar *tc;
1092 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1093 tc = realloc(ucs, ucs_len);
1094 if (!tc)
1095 goto err_out;
1096 ucs = tc;
1097 ucs_len /= sizeof(ntfschar);
1098 }
1099 /* Convert the multibyte character to a wide character. */
1100 #ifdef HAVE_MBSINIT
1101 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1102 #else
1103 cnt = mbtowc(&wc, ins + i, ins_size - i);
1104 #endif
1105 if (!cnt)
1106 break;
1107 if (cnt == -1)
1108 goto err_out;
1109 if (cnt < -1) {
1110 ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1111 errno = EINVAL;
1112 goto err_out;
1113 }
1114 /* Make sure we are not overflowing the NTFS Unicode set. */
1115 if ((unsigned long)wc >= (unsigned long)(1 <<
1116 (8 * sizeof(ntfschar)))) {
1117 errno = EILSEQ;
1118 goto err_out;
1119 }
1120 /* Convert the CPU wide character to a LE Unicode character. */
1121 ucs[o] = cpu_to_le16(wc);
1122 }
1123 #ifdef HAVE_MBSINIT
1124 /* Make sure we are back in the initial state. */
1125 if (!mbsinit(&mbstate)) {
1126 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1127 errno = EILSEQ;
1128 goto err_out;
1129 }
1130 #endif
1131 /* Now write the NULL character. */
1132 ucs[o] = const_cpu_to_le16(L'\0');
1133 *outs = ucs;
1134 return o;
1135 err_out:
1136 free(ucs);
1137 #else /* MB_CUR_MAX */
1138 errno = EILSEQ;
1139 #endif /* MB_CUR_MAX */
1140 return -1;
1141 }
1142
1143 /*
1144 * Turn a UTF8 name uppercase
1145 *
1146 * Returns an allocated uppercase name which has to be freed by caller
1147 * or NULL if there is an error (described by errno)
1148 */
1149
1150 char *ntfs_uppercase_mbs(const char *low,
1151 const ntfschar *upcase, u32 upcase_size)
1152 {
1153 int size;
1154 char *upp;
1155 u32 wc;
1156 int n;
1157 const char *s;
1158 char *t;
1159
1160 size = strlen(low);
1161 upp = (char*)ntfs_malloc(3*size + 1);
1162 if (upp) {
1163 s = low;
1164 t = upp;
1165 do {
1166 n = utf8_to_unicode(&wc, s);
1167 if (n > 0) {
1168 if (wc < upcase_size)
1169 wc = le16_to_cpu(upcase[wc]);
1170 if (wc < 0x80)
1171 *t++ = wc;
1172 else if (wc < 0x800) {
1173 *t++ = (0xc0 | ((wc >> 6) & 0x3f));
1174 *t++ = 0x80 | (wc & 0x3f);
1175 } else if (wc < 0x10000) {
1176 *t++ = 0xe0 | (wc >> 12);
1177 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1178 *t++ = 0x80 | (wc & 0x3f);
1179 } else {
1180 *t++ = 0xf0 | ((wc >> 18) & 7);
1181 *t++ = 0x80 | ((wc >> 12) & 63);
1182 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1183 *t++ = 0x80 | (wc & 0x3f);
1184 }
1185 s += n;
1186 }
1187 } while (n > 0);
1188 if (n < 0) {
1189 free(upp);
1190 upp = (char*)NULL;
1191 errno = EILSEQ;
1192 } else {
1193 *t = 0;
1194 }
1195 }
1196 return (upp);
1197 }
1198
1199 /**
1200 * ntfs_upcase_table_build - build the default upcase table for NTFS
1201 * @uc: destination buffer where to store the built table
1202 * @uc_len: size of destination buffer in bytes
1203 *
1204 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1205 * stores it in the caller supplied buffer @uc of size @uc_len.
1206 *
1207 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1208 */
1209 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1210 {
1211 struct NEWUPPERCASE {
1212 unsigned short first;
1213 unsigned short last;
1214 short diff;
1215 unsigned char step;
1216 unsigned char osmajor;
1217 unsigned char osminor;
1218 } ;
1219
1220 /*
1221 * This is the table as defined by Windows XP
1222 */
1223 static int uc_run_table[][3] = { /* Start, End, Add */
1224 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1225 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1226 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1227 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1228 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1229 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1230 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1231 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1232 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1233 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1234 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1235 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1236 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1237 {0}
1238 };
1239 static int uc_dup_table[][2] = { /* Start, End */
1240 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1241 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1242 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1243 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1244 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1245 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1246 {0}
1247 };
1248 static int uc_byte_table[][2] = { /* Offset, Value */
1249 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1250 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1251 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1252 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1253 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1254 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1255 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1256 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1257 {0}
1258 };
1259
1260 /*
1261 * Changes which were applied to later Windows versions
1262 *
1263 * md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373
1264 * Vista : 2f03b5a69d486ff3864cecbd07f24440
1265 * Win8 : 7ff498a44e45e77374cc7c962b1b92f2
1266 */
1267 static const struct NEWUPPERCASE newuppercase[] = {
1268 /* from Windows 6.0 (Vista) */
1269 { 0x37b, 0x37d, 0x82, 1, 6, 0 },
1270 { 0x1f80, 0x1f87, 0x8, 1, 6, 0 },
1271 { 0x1f90, 0x1f97, 0x8, 1, 6, 0 },
1272 { 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 },
1273 { 0x2c30, 0x2c5e, -0x30, 1, 6, 0 },
1274 { 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 },
1275 { 0x2c68, 0x2c6c, -0x1, 2, 6, 0 },
1276 { 0x219, 0x21f, -0x1, 2, 6, 0 },
1277 { 0x223, 0x233, -0x1, 2, 6, 0 },
1278 { 0x247, 0x24f, -0x1, 2, 6, 0 },
1279 { 0x3d9, 0x3e1, -0x1, 2, 6, 0 },
1280 { 0x48b, 0x48f, -0x1, 2, 6, 0 },
1281 { 0x4fb, 0x513, -0x1, 2, 6, 0 },
1282 { 0x2c81, 0x2ce3, -0x1, 2, 6, 0 },
1283 { 0x3f8, 0x3fb, -0x1, 3, 6, 0 },
1284 { 0x4c6, 0x4ce, -0x1, 4, 6, 0 },
1285 { 0x23c, 0x242, -0x1, 6, 6, 0 },
1286 { 0x4ed, 0x4f7, -0x1, 10, 6, 0 },
1287 { 0x450, 0x45d, -0x50, 13, 6, 0 },
1288 { 0x2c61, 0x2c76, -0x1, 21, 6, 0 },
1289 { 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 },
1290 { 0x180, 0x180, 0xc3, 1, 6, 0 },
1291 { 0x195, 0x195, 0x61, 1, 6, 0 },
1292 { 0x19a, 0x19a, 0xa3, 1, 6, 0 },
1293 { 0x19e, 0x19e, 0x82, 1, 6, 0 },
1294 { 0x1bf, 0x1bf, 0x38, 1, 6, 0 },
1295 { 0x1f9, 0x1f9, -0x1, 1, 6, 0 },
1296 { 0x23a, 0x23a, 0x2a2b, 1, 6, 0 },
1297 { 0x23e, 0x23e, 0x2a28, 1, 6, 0 },
1298 { 0x26b, 0x26b, 0x29f7, 1, 6, 0 },
1299 { 0x27d, 0x27d, 0x29e7, 1, 6, 0 },
1300 { 0x280, 0x280, -0xda, 1, 6, 0 },
1301 { 0x289, 0x289, -0x45, 1, 6, 0 },
1302 { 0x28c, 0x28c, -0x47, 1, 6, 0 },
1303 { 0x3f2, 0x3f2, 0x7, 1, 6, 0 },
1304 { 0x4cf, 0x4cf, -0xf, 1, 6, 0 },
1305 { 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 },
1306 { 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 },
1307 { 0x214e, 0x214e, -0x1c, 1, 6, 0 },
1308 { 0x2184, 0x2184, -0x1, 1, 6, 0 },
1309 /* from Windows 6.1 (Win7) */
1310 { 0x23a, 0x23e, 0x0, 4, 6, 1 },
1311 { 0x250, 0x250, 0x2a1f, 2, 6, 1 },
1312 { 0x251, 0x251, 0x2a1c, 2, 6, 1 },
1313 { 0x271, 0x271, 0x29fd, 2, 6, 1 },
1314 { 0x371, 0x373, -0x1, 2, 6, 1 },
1315 { 0x377, 0x377, -0x1, 2, 6, 1 },
1316 { 0x3c2, 0x3c2, 0x0, 2, 6, 1 },
1317 { 0x3d7, 0x3d7, -0x8, 2, 6, 1 },
1318 { 0x515, 0x523, -0x1, 2, 6, 1 },
1319 /* below, -0x75fc stands for 0x8a04 and truncation */
1320 { 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 },
1321 { 0x1efb, 0x1eff, -0x1, 2, 6, 1 },
1322 { 0x1fc3, 0x1ff3, 0x9, 48, 6, 1 },
1323 { 0x1fcc, 0x1ffc, 0x0, 48, 6, 1 },
1324 { 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 },
1325 { 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 },
1326 { 0x2c73, 0x2c73, -0x1, 2, 6, 1 },
1327 { 0xa641, 0xa65f, -0x1, 2, 6, 1 },
1328 { 0xa663, 0xa66d, -0x1, 2, 6, 1 },
1329 { 0xa681, 0xa697, -0x1, 2, 6, 1 },
1330 { 0xa723, 0xa72f, -0x1, 2, 6, 1 },
1331 { 0xa733, 0xa76f, -0x1, 2, 6, 1 },
1332 { 0xa77a, 0xa77c, -0x1, 2, 6, 1 },
1333 { 0xa77f, 0xa787, -0x1, 2, 6, 1 },
1334 { 0xa78c, 0xa78c, -0x1, 2, 6, 1 },
1335 /* end mark */
1336 { 0 }
1337 } ;
1338
1339 int i, r;
1340 int k, off;
1341 const struct NEWUPPERCASE *puc;
1342
1343 memset((char*)uc, 0, uc_len);
1344 uc_len >>= 1;
1345 if (uc_len > 65536)
1346 uc_len = 65536;
1347 for (i = 0; (u32)i < uc_len; i++)
1348 uc[i] = cpu_to_le16(i);
1349 for (r = 0; uc_run_table[r][0]; r++) {
1350 off = uc_run_table[r][2];
1351 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1352 uc[i] = cpu_to_le16(i + off);
1353 }
1354 for (r = 0; uc_dup_table[r][0]; r++)
1355 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1356 uc[i + 1] = cpu_to_le16(i);
1357 for (r = 0; uc_byte_table[r][0]; r++) {
1358 k = uc_byte_table[r][1];
1359 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1360 }
1361 for (r=0; newuppercase[r].first; r++) {
1362 puc = &newuppercase[r];
1363 if ((puc->osmajor < UPCASE_MAJOR)
1364 || ((puc->osmajor == UPCASE_MAJOR)
1365 && (puc->osminor <= UPCASE_MINOR))) {
1366 off = puc->diff;
1367 for (i = puc->first; i <= puc->last; i += puc->step)
1368 uc[i] = cpu_to_le16(i + off);
1369 }
1370 }
1371 }
1372
1373 /*
1374 * Allocate and build the default upcase table
1375 *
1376 * Returns the number of entries
1377 * 0 if failed
1378 */
1379
1380 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1381
1382 u32 ntfs_upcase_build_default(ntfschar **upcase)
1383 {
1384 u32 upcase_len = 0;
1385
1386 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1387 if (*upcase) {
1388 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1389 upcase_len = UPCASE_LEN;
1390 }
1391 return (upcase_len);
1392 }
1393
1394 /*
1395 * Build a table for converting to lower case
1396 *
1397 * This is only meaningful when there is a single lower case
1398 * character leading to an upper case one, and currently the
1399 * only exception is the greek letter sigma which has a single
1400 * upper case glyph (code U+03A3), but two lower case glyphs
1401 * (code U+03C3 and U+03C2, the latter to be used at the end
1402 * of a word). In the following implementation the upper case
1403 * sigma will be lowercased as U+03C3.
1404 */
1405
1406 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1407 {
1408 ntfschar *lc;
1409 u32 upp;
1410 u32 i;
1411
1412 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1413 if (lc) {
1414 for (i=0; i<uc_cnt; i++)
1415 lc[i] = cpu_to_le16(i);
1416 for (i=0; i<uc_cnt; i++) {
1417 upp = le16_to_cpu(uc[i]);
1418 if ((upp != i) && (upp < uc_cnt))
1419 lc[upp] = cpu_to_le16(i);
1420 }
1421 } else
1422 ntfs_log_error("Could not build the locase table\n");
1423 return (lc);
1424 }
1425
1426 /**
1427 * ntfs_str2ucs - convert a string to a valid NTFS file name
1428 * @s: input string
1429 * @len: length of output buffer in Unicode characters
1430 *
1431 * Convert the input @s string into the corresponding little endian,
1432 * 2-byte Unicode string. The length of the converted string is less
1433 * or equal to the maximum length allowed by the NTFS format (255).
1434 *
1435 * If @s is NULL then return AT_UNNAMED.
1436 *
1437 * On success the function returns the Unicode string in an allocated
1438 * buffer and the caller is responsible to free it when it's not needed
1439 * anymore.
1440 *
1441 * On error NULL is returned and errno is set to the error code.
1442 */
1443 ntfschar *ntfs_str2ucs(const char *s, int *len)
1444 {
1445 ntfschar *ucs = NULL;
1446
1447 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1448 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1449 return NULL;
1450 }
1451 if (*len > NTFS_MAX_NAME_LEN) {
1452 free(ucs);
1453 errno = ENAMETOOLONG;
1454 return NULL;
1455 }
1456 if (!ucs || !*len) {
1457 ucs = AT_UNNAMED;
1458 *len = 0;
1459 }
1460 return ucs;
1461 }
1462
1463 /**
1464 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1465 * @ucs input string to be freed
1466 *
1467 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1468 *
1469 * Return value: none.
1470 */
1471 void ntfs_ucsfree(ntfschar *ucs)
1472 {
1473 if (ucs && (ucs != AT_UNNAMED))
1474 free(ucs);
1475 }
1476
1477 /*
1478 * Check whether a name contains no chars forbidden
1479 * for DOS or Win32 use
1480 *
1481 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1482 * These names are technically allowed in the Win32 namespace, but
1483 * they can be problematic. See comment for FILE_NAME_WIN32.
1484 *
1485 * If there is a bad char, errno is set to EINVAL
1486 */
1487
1488 BOOL ntfs_forbidden_chars(const ntfschar *name, int len, BOOL strict)
1489 {
1490 BOOL forbidden;
1491 int ch;
1492 int i;
1493 static const u32 mainset = (1L << ('\"' - 0x20))
1494 | (1L << ('*' - 0x20))
1495 | (1L << ('/' - 0x20))
1496 | (1L << (':' - 0x20))
1497 | (1L << ('<' - 0x20))
1498 | (1L << ('>' - 0x20))
1499 | (1L << ('?' - 0x20));
1500
1501 forbidden = (len == 0) ||
1502 (strict && (name[len-1] == const_cpu_to_le16(' ') ||
1503 name[len-1] == const_cpu_to_le16('.')));
1504 for (i=0; i<len; i++) {
1505 ch = le16_to_cpu(name[i]);
1506 if ((ch < 0x20)
1507 || ((ch < 0x40)
1508 && ((1L << (ch - 0x20)) & mainset))
1509 || (ch == '\\')
1510 || (ch == '|'))
1511 forbidden = TRUE;
1512 }
1513 if (forbidden)
1514 errno = EINVAL;
1515 return (forbidden);
1516 }
1517
1518 /*
1519 * Check whether a name contains no forbidden chars and
1520 * is not a reserved name for DOS or Win32 use
1521 *
1522 * The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
1523 * with no suffix or any suffix.
1524 *
1525 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1526 * These names are technically allowed in the Win32 namespace, but
1527 * they can be problematic. See comment for FILE_NAME_WIN32.
1528 *
1529 * If the name is forbidden, errno is set to EINVAL
1530 */
1531
1532 BOOL ntfs_forbidden_names(ntfs_volume *vol, const ntfschar *name, int len,
1533 BOOL strict)
1534 {
1535 BOOL forbidden;
1536 int h;
1537 static const ntfschar dot = const_cpu_to_le16('.');
1538 static const ntfschar con[] = { const_cpu_to_le16('c'),
1539 const_cpu_to_le16('o'), const_cpu_to_le16('n') };
1540 static const ntfschar prn[] = { const_cpu_to_le16('p'),
1541 const_cpu_to_le16('r'), const_cpu_to_le16('n') };
1542 static const ntfschar aux[] = { const_cpu_to_le16('a'),
1543 const_cpu_to_le16('u'), const_cpu_to_le16('x') };
1544 static const ntfschar nul[] = { const_cpu_to_le16('n'),
1545 const_cpu_to_le16('u'), const_cpu_to_le16('l') };
1546 static const ntfschar com[] = { const_cpu_to_le16('c'),
1547 const_cpu_to_le16('o'), const_cpu_to_le16('m') };
1548 static const ntfschar lpt[] = { const_cpu_to_le16('l'),
1549 const_cpu_to_le16('p'), const_cpu_to_le16('t') };
1550
1551 forbidden = ntfs_forbidden_chars(name, len, strict);
1552 if (!forbidden && (len >= 3)) {
1553 /*
1554 * Rough hash check to tell whether the first couple of chars
1555 * may be one of CO PR AU NU LP or lowercase variants.
1556 */
1557 h = ((le16_to_cpu(name[0]) & 31)*48)
1558 ^ ((le16_to_cpu(name[1]) & 31)*165);
1559 if ((h % 23) == 17) {
1560 /* do a full check, depending on the third char */
1561 switch (le16_to_cpu(name[2]) & ~0x20) {
1562 case 'N' :
1563 if (((len == 3) || (name[3] == dot))
1564 && (!ntfs_ucsncasecmp(name, con, 3,
1565 vol->upcase, vol->upcase_len)
1566 || !ntfs_ucsncasecmp(name, prn, 3,
1567 vol->upcase, vol->upcase_len)))
1568 forbidden = TRUE;
1569 break;
1570 case 'X' :
1571 if (((len == 3) || (name[3] == dot))
1572 && !ntfs_ucsncasecmp(name, aux, 3,
1573 vol->upcase, vol->upcase_len))
1574 forbidden = TRUE;
1575 break;
1576 case 'L' :
1577 if (((len == 3) || (name[3] == dot))
1578 && !ntfs_ucsncasecmp(name, nul, 3,
1579 vol->upcase, vol->upcase_len))
1580 forbidden = TRUE;
1581 break;
1582 case 'M' :
1583 if ((len > 3)
1584 && (le16_to_cpu(name[3]) >= '1')
1585 && (le16_to_cpu(name[3]) <= '9')
1586 && ((len == 4) || (name[4] == dot))
1587 && !ntfs_ucsncasecmp(name, com, 3,
1588 vol->upcase, vol->upcase_len))
1589 forbidden = TRUE;
1590 break;
1591 case 'T' :
1592 if ((len > 3)
1593 && (le16_to_cpu(name[3]) >= '1')
1594 && (le16_to_cpu(name[3]) <= '9')
1595 && ((len == 4) || (name[4] == dot))
1596 && !ntfs_ucsncasecmp(name, lpt, 3,
1597 vol->upcase, vol->upcase_len))
1598 forbidden = TRUE;
1599 break;
1600 }
1601 }
1602 }
1603
1604 if (forbidden)
1605 errno = EINVAL;
1606 return (forbidden);
1607 }
1608
1609 /*
1610 * Check whether the same name can be used as a DOS and
1611 * a Win32 name
1612 *
1613 * The names must be the same, or the short name the uppercase
1614 * variant of the long name
1615 */
1616
1617 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1618 const ntfschar *shortname, int shortlen,
1619 const ntfschar *longname, int longlen)
1620 {
1621 BOOL collapsible;
1622 unsigned int ch;
1623 unsigned int cs;
1624 int i;
1625
1626 collapsible = shortlen == longlen;
1627 for (i=0; collapsible && (i<shortlen); i++) {
1628 ch = le16_to_cpu(longname[i]);
1629 cs = le16_to_cpu(shortname[i]);
1630 if ((cs != ch)
1631 && ((ch >= vol->upcase_len)
1632 || (cs >= vol->upcase_len)
1633 || (vol->upcase[cs] != vol->upcase[ch])))
1634 collapsible = FALSE;
1635 }
1636 return (collapsible);
1637 }
1638
1639 /*
1640 * Define the character encoding to be used.
1641 * Use UTF-8 unless specified otherwise.
1642 */
1643
1644 int ntfs_set_char_encoding(const char *locale)
1645 {
1646 use_utf8 = 0;
1647 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1648 || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1649 use_utf8 = 1;
1650 else
1651 if (setlocale(LC_ALL, locale))
1652 use_utf8 = 0;
1653 else {
1654 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1655 use_utf8 = 1;
1656 }
1657 return 0; /* always successful */
1658 }
1659
1660 #if defined(__APPLE__) || defined(__DARWIN__)
1661
/*
 *		Enable or disable the normalization of file names
 *
 *	@normalize must be 0 or 1, it is recorded into nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded,
 *		-1 if @normalize is not 0 or 1, or if the normalization
 *		was not compiled in (ENABLE_NFCONV not defined).
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1675
/*
 *		Normalize a UTF-8 string through CoreFoundation
 *
 *	The string is normalized to form C if @composed is not zero,
 *	and to form D otherwise. On success the '\0'-terminated result
 *	is returned in *target, in an allocated buffer which has to be
 *	freed by the caller. *target is left unchanged on failure.
 *
 *	Returns the length in bytes of the result (terminator excluded)
 *		-1 if the result could not be sized, allocated or
 *			converted, or if ENABLE_NFCONV is not defined
 *		-2 if the source CFString could not be created
 *		-3 if the mutable copy could not be created
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed)
{
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to
	 * the linker. */
	CFStringRef source;
	CFMutableStringRef normalized;
	CFRange whole;
	CFIndex needed;
	char *buffer = NULL;
	int buffer_size = -1;

	/* Convert the UTF-8 string to a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault,
			utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Get a mutable copy which we are free to modify, the source
	 * string is not needed any more from there. */
	normalized = CFStringCreateMutableCopy(kCFAllocatorDefault, 0,
			source);
	CFRelease(source);
	if (normalized == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize the copy to the desired normalization form. */
	if (composed != 0)
		CFStringNormalize(normalized, kCFStringNormalizationFormC);
	else
		CFStringNormalize(normalized, kCFStringNormalizationFormD);

	/* First call without a buffer, to get the needed size. */
	whole = CFRangeMake(0, CFStringGetLength(normalized));
	if (CFStringGetBytes(normalized, whole,
			kCFStringEncodingUTF8, 0, false, NULL, 0,
			&needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of "
			"UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte for the '\0', which is set by ntfs_calloc(). */
	buffer_size = sizeof(char) * (needed + 1);
	buffer = ntfs_calloc(buffer_size);
	if (buffer == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d "
			"bytes for char *result.\n", buffer_size);
		goto release;
	}

	/* Second call, to actually get the UTF-8 encoded bytes. */
	if (CFStringGetBytes(normalized, whole,
			kCFStringEncodingUTF8, 0, false,
			(UInt8*) buffer, buffer_size - 1,
			&needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 "
			"conversion of normalized "
			"CFMutableString.\n");
		free(buffer);
		buffer = NULL;
	}

release:
	CFRelease(normalized);

	if (buffer == NULL)
		return -1;
	*target = buffer;
	return buffer_size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1757 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1758