scrub/unicrash.c - pub/scm/linux/kernel/git/djwong/xfsprogs-dev - Git at Google

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
  * Author: Darrick J. Wong <djwong@kernel.org>
  */
 #include "xfs.h"
 #include "xfs_arch.h"
 #include "list.h"
 #include <stdint.h>
 #include <stdlib.h>
 #include <dirent.h>
 #include <sys/types.h>
 #include <sys/statvfs.h>
 #include <strings.h>
 #include <unicode/uclean.h>
 #include <unicode/ustring.h>
 #include <unicode/unorm2.h>
 #include <unicode/uspoof.h>
 #include "libfrog/paths.h"
 #include "xfs_scrub.h"
 #include "common.h"
 #include "descr.h"
 #include "unicrash.h"

 /*
  * Detect Unicode confusable names in directories and attributes.
  *
  * Record all the name->ino mappings in a directory/xattr, with a twist!  The
  * twist is to record the Unicode skeleton and normalized version of every
  * name we see so that we can check for a name space (directory, extended
  * attribute set) containing names containing malicious characters or that
  * could be confused for one another.  These entries are at best a sign of
  * Unicode mishandling, or some sort of weird name substitution attack if the
  * entries do not point to the same inode.  Warn if we see multiple dirents
  * that do not all point to the same inode.
  *
  * For extended attributes we perform the same collision checks on the
  * attribute, though any collision is enough to trigger a warning.
  *
  * We avoid flagging these problems as errors because XFS treats names as a
  * sequence of arbitrary nonzero bytes.  While a Unicode collision is not
  * technically a filesystem corruption, we ought to say something if there's a
  * possibility for misleading a user.  Unquestionably bad things (direction
  * overrides, control characters, names that normalize to the same string)
  * produce warnings, whereas potentially confusable names produce
  * informational messages.
  *
  * The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
  * the Unicode technical standard #39.  First we normalize the name, then we
  * substitute code points according to the confusable code point table, then
  * normalize again.
  *
  * We take the extra step of removing non-identifier code points such as
  * formatting characters, control characters, zero width characters, etc.
  * from the skeleton so that we can complain about names that are confusable
  * due to invisible control characters.
  *
  * In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
  */

 /* Bitmask of UNICRASH_* complaints about a name. */
 typedef uint16_t __bitwise	badname_t;

 /*
  * One name seen in a namespace, together with the derived strings that are
  * compared to detect collisions.  The raw name bytes are stored inline after
  * the fixed part of the structure; allocate with NAME_ENTRY_SZ().
  */
 struct name_entry {
 	/* Next entry in the same hash bucket. */
 	struct name_entry	*next;

 	/* NFKC normalized name */
 	UChar			*normstr;

 	/* Unicode skeletonized name */
 	UChar			*skelstr;

 	/* Lengths for normstr and skelstr */
 	int32_t			normstrlen;
 	int32_t			skelstrlen;

 	xfs_ino_t		ino;

 	/* Everything that we don't like about this name. */
 	badname_t		badflags;

 	/* Raw dirent name */
 	uint16_t		namelen;
 	char			name[];
 };

 /* Bytes needed for a name_entry holding an nl-byte name plus its NUL. */
 #define NAME_ENTRY_SZ(nl)	(sizeof(struct name_entry) + 1 + \
 				 ((nl) * sizeof(uint8_t)))

 /*
  * Collision detector state for one namespace: the ICU helpers used to derive
  * comparison strings and a hash table of every name seen so far.  Allocate
  * with UNICRASH_SZ() so that the bucket array follows the fixed fields.
  */
 struct unicrash {
 	struct scrub_ctx	*ctx;
 	USpoofChecker		*spoof;
 	const UNormalizer2	*nfkc;
 	const UNormalizer2	*nfc;

 	/* Only report a collision if the two names map to different inodes. */
 	bool			compare_ino;

 	/* Suppress purely informational messages unless running verbosely. */
 	bool			is_only_root_writeable;

 	size_t			nr_buckets;
 	struct name_entry	*buckets[];
 };

 /* Bytes needed for a struct unicrash with nr hash buckets. */
 #define UNICRASH_SZ(nr)		(sizeof(struct unicrash) + \
 				 ((nr) * sizeof(struct name_entry *)))

 /*
  * Things to complain about in Unicode naming.  Each value below is a single
  * bit in a badname_t mask, so several complaints can be recorded per name.
  */

 /* Everything is ok */
 #define UNICRASH_OK		((__force badname_t)0)

 /*
  * Multiple names resolve to the same normalized string and therefore render
  * identically.
  */
 #define UNICRASH_NOT_UNIQUE	((__force badname_t)(1U << 0))

 /* Name contains directional overrides. */
 #define UNICRASH_BIDI_OVERRIDE	((__force badname_t)(1U << 1))

 /* Name mixes left-to-right and right-to-left characters. */
 #define UNICRASH_BIDI_MIXED	((__force badname_t)(1U << 2))

 /* Control characters in name. */
 #define UNICRASH_CONTROL_CHAR	((__force badname_t)(1U << 3))

 /* Invisible characters.  Only a problem if we have collisions. */
 #define UNICRASH_INVISIBLE	((__force badname_t)(1U << 4))

 /* Multiple names resolve to the same skeleton string. */
 #define UNICRASH_CONFUSABLE	((__force badname_t)(1U << 5))

 /* Possible phony file extension. */
 #define UNICRASH_PHONY_EXTENSION ((__force badname_t)(1U << 6))

 /* FULL STOP (aka period), 0x2E, as a full code point for comparisons. */
 #define UCHAR_PERIOD		((UChar32)'.')

 /*
  * Unicode collision checks only make sense when the system is configured
  * for utf8.  We decide that by looking for ".UTF-8" anywhere in the name of
  * the locale used for message output; if it is absent, no checking is done.
  *
  * Most modern Linux systems default to utf8, so this should only report
  * false when the administrator chose a different configuration or when
  * there is no usable locale data at all.
  *
  * The verdict is computed once and remembered.  A failed locale query is
  * reported as "not utf8" without being remembered.
  */
 #define UTF8_STR		".UTF-8"
 #define UTF8_STRLEN		(sizeof(UTF8_STR) - 1)
 static bool
 is_utf8_locale(void)
 {
 	static int		cached = -1;
 	const char		*locale_name;

 	if (cached == -1) {
 		locale_name = setlocale(LC_MESSAGES, NULL);
 		if (!locale_name)
 			return false;

 		cached = strstr(locale_name, UTF8_STR) ? 1 : 0;
 	}

 	return cached;
 }

 /*
  * Strip every code point that u_isIDIgnorable() reports from this string,
  * in place, and return the resulting length.  Each removal slides the rest
  * of the string down, including the UChar just past the end (the
  * terminator).  U16_NEXT needs a UChar32 to decode into, despite the name.
  */
 static int32_t
 remove_ignorable(
 	UChar		*ustr,
 	int32_t		ustrlen)
 {
 	int32_t		keep = 0;	/* end of the text retained so far */
 	int32_t		scan = 0;	/* read cursor */

 	while (scan < ustrlen) {
 		UChar32	cp;
 		int32_t	width;

 		U16_NEXT(ustr, scan, ustrlen, cp);
 		if (!u_isIDIgnorable(cp)) {
 			/* Keep this code point. */
 			keep = scan;
 			continue;
 		}

 		/* Close the gap left by the code point we just decoded. */
 		width = scan - keep;
 		memmove(&ustr[keep], &ustr[scan],
 				(ustrlen - scan + 1) * sizeof(UChar));
 		ustrlen -= width;
 		scan = keep;
 	}

 	return keep;
 }

 /*
  * Some unicode codepoints are formatting hints that a display system is not
  * supposed to draw.  They can be embedded in file names to confuse users, so
  * report whether this codepoint is one of the ones we know about.
  *
  * Download https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt and
  * $ grep -E '(zero width|invisible|joiner|application)' -i UnicodeData.txt
  */
 static inline bool is_nonrendering(UChar32 uchr)
 {
 	static const UChar32	invisible[] = {
 		0x034F,		/* combining grapheme joiner */
 		0x200B,		/* zero width space */
 		0x200C,		/* zero width non-joiner */
 		0x200D,		/* zero width joiner */
 		0x2060,		/* word joiner */
 		0x2061,		/* function application */
 		0x2062,		/* invisible times (multiply) */
 		0x2063,		/* invisible separator (comma) */
 		0x2064,		/* invisible plus (addition) */
 		0x2D7F,		/* tifinagh consonant joiner */
 		0xFEFF,		/* zero width non breaking space */
 	};
 	size_t			i;

 	for (i = 0; i < sizeof(invisible) / sizeof(invisible[0]); i++) {
 		if (invisible[i] == uchr)
 			return true;
 	}

 	return false;
 }

 /*
  * Report whether this unicode codepoint resembles a period (".") closely
  * enough that a user might take whatever alphanumeric text follows it for
  * the file extension.  Most codepoints named "full stop" do not qualify.
  *
  * $ grep -i 'full stop' UnicodeData.txt
  */
 static inline bool is_fullstop_lookalike(UChar32 uchr)
 {
 	static const UChar32	lookalikes[] = {
 		0x0701,		/* syriac supralinear full stop */
 		0x0702,		/* syriac sublinear full stop */
 		0x2024,		/* one dot leader */
 		0xA4F8,		/* lisu letter tone mya ti */
 		0xFE52,		/* small full stop */
 		0xFF61,		/* halfwidth ideographic full stop */
 		0xFF0E,		/* fullwidth full stop */
 	};
 	size_t			i;

 	for (i = 0; i < sizeof(lookalikes) / sizeof(lookalikes[0]); i++) {
 		if (lookalikes[i] == uchr)
 			return true;
 	}

 	return false;
 }

 /* Number of UChar units needed to hold any single UChar32 codepoint. */
 #define UCHAR_PER_UCHAR32	2

 /*
  * Encode one UChar32 into @buf, which must have room for UCHAR_PER_UCHAR32
  * units.  Returns the number of units written, or 0 if U16_APPEND flagged
  * an error.
  */
 static inline int32_t
 uchar32_to_uchar(
 	UChar32		uchr,
 	UChar		*buf)
 {
 	int32_t		nr_units = 0;
 	bool		failed = false;

 	U16_APPEND(buf, nr_units, UCHAR_PER_UCHAR32, uchr, failed);
 	return failed ? 0 : nr_units;
 }

 /* Decode and return the first UChar32 code point of this UChar string. */
 static inline UChar32
 uchar_to_uchar32(
 	UChar		*buf,
 	int32_t		buflen)
 {
 	int32_t		pos = 0;
 	UChar32		cp;

 	U16_NEXT(buf, pos, buflen, cp);
 	return cp;
 }

 /*
  * For characters that are not themselves a full stop (0x2E), let's see if the
  * compatibility normalization (NFKC) will turn it into something that starts
  * with a full stop.  If so, then this could be the start of a phony file
  * extension.
  *
  * Returns true if the first code point of NFKC(uchr) is a full stop, and
  * false for an actual full stop, for anything that cannot be encoded or
  * normalized, and for everything else.
  */
 static bool
 is_period_lookalike(
 	struct unicrash	*uc,
 	UChar32		uchr)
 {
 	/*
 	 * NFKC can expand a single code point into several UChars, so the
 	 * output buffer must be larger than one code point.  The Unicode
 	 * normalization FAQ lists 18 UTF-16 units as the worst case; anything
 	 * that still does not fit is rejected below rather than overrunning
 	 * the stack buffer.
 	 */
 	enum { NFKC_BUFLEN = 32 };
 	UChar		uchrstr[UCHAR_PER_UCHAR32];
 	UChar		nfkcstr[NFKC_BUFLEN];
 	int32_t		uchrstrlen, nfkcstrlen;
 	UChar32		nfkc_uchr;
 	UErrorCode	uerr = U_ZERO_ERROR;

 	if (uchr == UCHAR_PERIOD)
 		return false;

 	uchrstrlen = uchar32_to_uchar(uchr, uchrstr);
 	if (!uchrstrlen)
 		return false;

 	/*
 	 * Normalize the UChar string to NFKC form, which does all the
 	 * compatibility transformations.  We normalize straight into the
 	 * bounded buffer instead of preflighting with a NULL destination:
 	 * a preflight reports U_BUFFER_OVERFLOW_ERROR for every non-empty
 	 * result, so treating that status as fatal would reject every input.
 	 * Here an overflow really does mean "too long to be interesting".
 	 */
 	nfkcstrlen = unorm2_normalize(uc->nfkc, uchrstr, uchrstrlen, nfkcstr,
 			NFKC_BUFLEN, &uerr);
 	if (U_FAILURE(uerr) || nfkcstrlen <= 0)
 		return false;

 	nfkc_uchr = uchar_to_uchar32(nfkcstr, nfkcstrlen);
 	return nfkc_uchr == UCHAR_PERIOD;
 }

 /*
  * Look for directory entry names that end in something that resembles a
  * file extension but is not one.  We define that as a code point that
  * renders like a period ("full stop" in unicode parlance) without being a
  * period, followed only by alphanumeric code points, nonrendering code
  * points, or periods, all the way to the end of the name.
  *
  * The 3cx attack used a zip file containing an executable file named "job
  * offer․pdf".  Note that the dot mark in the extension is /not/ a period but
  * the Unicode codepoint "leader dot".  The file was also marked executable
  * inside the zip file, which meant that naïve file explorers could inflate
  * the file and restore the execute bit.  If a user double-clicked on the file,
  * the binary would open a decoy pdf while infecting the system.
  *
  * This check must use canonical (not compatibility) normalization, because
  * compatibility mode turns certain code points (e.g. one dot leader, 0x2024)
  * into real periods (0x2e).  The NFC string is only needed here, so it is
  * built and thrown away inside this function instead of being kept in the
  * name entry.
  *
  * Returns UNICRASH_PHONY_EXTENSION if the name looks deceptive, otherwise
  * UNICRASH_OK (including when normalization or allocation fails).
  */
 static badname_t
 name_entry_phony_extension(
 	struct unicrash	*uc,
 	const UChar	*unistr,
 	int32_t		unistrlen)
 {
 	UCharIterator	iter;
 	UErrorCode	status = U_ZERO_ERROR;
 	UChar		*nfc;
 	UChar32		cp;
 	int32_t		nfclen;
 	bool		suspicious = false;
 	badname_t	flags = UNICRASH_OK;

 	/* Ask how long the NFC form is, then build it. */
 	nfclen = unorm2_normalize(uc->nfc, unistr, unistrlen, NULL, 0,
 			&status);
 	if (status != U_BUFFER_OVERFLOW_ERROR || nfclen < 0)
 		return flags;

 	nfc = calloc(nfclen + 1, sizeof(UChar));
 	if (!nfc)
 		return flags;

 	status = U_ZERO_ERROR;
 	unorm2_normalize(uc->nfc, unistr, unistrlen, nfc, nfclen, &status);
 	if (U_FAILURE(status)) {
 		free(nfc);
 		return flags;
 	}

 	/* Walk the NFC string one code point at a time. */
 	uiter_setString(&iter, nfc, nfclen);
 	for (cp = uiter_next32(&iter); cp != U_SENTINEL;
 	     cp = uiter_next32(&iter)) {
 		/*
 		 * Something that *looks* like a full stop (0x2E) but is not
 		 * one could begin a phony extension.  The normalizer check
 		 * should catch most of these; the explicit table records the
 		 * ones known to have been used in attacks.
 		 */
 		if (is_period_lookalike(uc, cp) || is_fullstop_lookalike(cp)) {
 			suspicious = true;
 			continue;
 		}

 		/*
 		 * File explorers like to hide file extensions in the name of
 		 * "user friendliness", so real periods neither start nor end
 		 * a suspicious run.
 		 */
 		if (cp == UCHAR_PERIOD)
 			continue;

 		/*
 		 * File extensions (as far as the author knows) tend only to
 		 * use ascii alphanumerics, so anything else that renders
 		 * ends the suspicious run.
 		 */
 		if (suspicious && !u_isalnum(cp) && !is_nonrendering(cp))
 			suspicious = false;
 	}

 	if (suspicious)
 		flags |= UNICRASH_PHONY_EXTENSION;

 	free(nfc);
 	return flags;
 }

 /*
  * Generate normalized form and skeleton of the name.  If this fails, just
  * forget everything and return false; this is an advisory checker.
  *
  * On success the entry takes ownership of the newly allocated normstr and
  * skelstr buffers and their lengths are recorded; on failure nothing is
  * stored in the entry and every temporary buffer is freed.  Directory
  * entries (nonzero ino) are also checked for phony file extensions here,
  * while the un-normalized UChar string is still available.
  *
  * Each conversion below is done in two passes: the first pass is given no
  * output buffer and is only used to learn the required length, and the code
  * gives up unless that pass reports U_BUFFER_OVERFLOW_ERROR with a
  * non-negative length.  The second pass fills a buffer of that length plus
  * one zeroed UChar.
  */
 static bool
 name_entry_compute_checknames(
 	struct unicrash		*uc,
 	struct name_entry	*entry)
 {
 	UChar			*normstr;
 	UChar			*unistr;
 	UChar			*skelstr;
 	int32_t			normstrlen;
 	int32_t			unistrlen;
 	int32_t			skelstrlen;
 	UErrorCode		uerr = U_ZERO_ERROR;

 	/* Convert bytestr to unistr for normalization */
 	u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
 	if (uerr != U_BUFFER_OVERFLOW_ERROR || unistrlen < 0)
 		return false;
 	uerr = U_ZERO_ERROR;
 	unistr = calloc(unistrlen + 1, sizeof(UChar));
 	if (!unistr)
 		return false;
 	u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
 			&uerr);
 	if (U_FAILURE(uerr))
 		goto out_unistr;

 	/* Normalize the string. */
 	normstrlen = unorm2_normalize(uc->nfkc, unistr, unistrlen, NULL,
 			0, &uerr);
 	if (uerr != U_BUFFER_OVERFLOW_ERROR || normstrlen < 0)
 		goto out_unistr;
 	uerr = U_ZERO_ERROR;
 	normstr = calloc(normstrlen + 1, sizeof(UChar));
 	if (!normstr)
 		goto out_unistr;
 	unorm2_normalize(uc->nfkc, unistr, unistrlen, normstr, normstrlen,
 			&uerr);
 	if (U_FAILURE(uerr))
 		goto out_normstr;

 	/* Compute skeleton. */
 	skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
 			0, &uerr);
 	if (uerr != U_BUFFER_OVERFLOW_ERROR || skelstrlen < 0)
 		goto out_normstr;
 	uerr = U_ZERO_ERROR;
 	skelstr = calloc(skelstrlen + 1, sizeof(UChar));
 	if (!skelstr)
 		goto out_normstr;
 	uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
 			&uerr);
 	if (U_FAILURE(uerr))
 		goto out_skelstr;

 	/* Drop ignorable code points so invisible differences collide. */
 	skelstrlen = remove_ignorable(skelstr, skelstrlen);

 	/* Check for deceptive file extensions in directory entry names. */
 	if (entry->ino)
 		entry->badflags |= name_entry_phony_extension(uc, unistr,
 						unistrlen);

 	/* Success: hand the derived strings to the entry. */
 	entry->skelstr = skelstr;
 	entry->skelstrlen = skelstrlen;
 	entry->normstr = normstr;
 	entry->normstrlen = normstrlen;
 	free(unistr);
 	return true;

 	/* Failure: unwind in reverse order of allocation. */
 out_skelstr:
 	free(skelstr);
 out_normstr:
 	free(normstr);
 out_unistr:
 	free(unistr);
 	return false;
 }

 /*
  * Check a name for suspicious elements that have appeared in filename
  * spoofing attacks.  This includes names that mixed directions or contain
  * direction overrides control characters, both of which have appeared in
  * filename spoofing attacks.
  */
 static unsigned int
 name_entry_examine(
 	const struct name_entry	*entry)
 {
 	UCharIterator		uiter;
 	UChar32			uchr;
 	uint8_t			mask = 0;
 	unsigned int		ret = 0;

 	uiter_setString(&uiter, entry->normstr, entry->normstrlen);
 	while ((uchr = uiter_next32(&uiter)) != U_SENTINEL) {
 	/* characters are invisible */
 		if (is_nonrendering(uchr))
 			ret |= UNICRASH_INVISIBLE;

 		/* control characters */
 		if (u_iscntrl(uchr))
 			ret |= UNICRASH_CONTROL_CHAR;

 		switch (u_charDirection(uchr)) {
 		case U_LEFT_TO_RIGHT:
 			mask |= 0x01;
 			break;
 		case U_RIGHT_TO_LEFT:
 			mask |= 0x02;
 			break;
 		case U_RIGHT_TO_LEFT_OVERRIDE:
 			ret |= UNICRASH_BIDI_OVERRIDE;
 			break;
 		case U_LEFT_TO_RIGHT_OVERRIDE:
 			ret |= UNICRASH_BIDI_OVERRIDE;
 			break;
 		default:
 			break;
 		}
 	}

 	/* mixing left-to-right and right-to-left chars */
 	if (mask == 0x3)
 		ret |= UNICRASH_BIDI_MIXED;
 	return ret;
 }

 /*
  * Allocate a name entry for this name and inode, compute its comparison
  * strings, and record anything suspicious about the name by itself.  On
  * success the entry is passed back through @entry and true is returned; on
  * any failure @entry is left alone and false is returned.
  */
 static bool
 name_entry_create(
 	struct unicrash		*uc,
 	const char		*name,
 	xfs_ino_t		ino,
 	struct name_entry	**entry)
 {
 	struct name_entry	*ne;
 	size_t			namelen = strlen(name);

 	/* The length must fit in the entry's 16-bit field; should never fail. */
 	if (namelen > UINT16_MAX) {
 		ASSERT(namelen <= UINT16_MAX);
 		return false;
 	}

 	ne = calloc(NAME_ENTRY_SZ(namelen), 1);
 	if (!ne)
 		return false;

 	/* Fill out the raw name and its owner. */
 	ne->next = NULL;
 	ne->ino = ino;
 	ne->namelen = namelen;
 	memcpy(ne->name, name, namelen);
 	ne->name[namelen] = 0;

 	/* Normalize/skeletonize name to find collisions. */
 	if (!name_entry_compute_checknames(uc, ne)) {
 		free(ne);
 		return false;
 	}

 	ne->badflags |= name_entry_examine(ne);
 	*entry = ne;
 	return true;
 }

 /* Release a name entry along with its skeleton and normalized strings. */
 static void
 name_entry_free(
 	struct name_entry	*entry)
 {
 	free(entry->skelstr);
 	free(entry->normstr);
 	free(entry);
 }

 /* Adapt the dirhash function from libxfs, avoid linking with libxfs. */

 #define rol32(x, y)		(((x) << (y)) | ((x) >> (32 - (y))))

 /*
  * Hash the bytes of an entry's skeleton string.  For every byte, the running
  * hash is rotated left by 7 bits and the byte is XORed in; the loop below
  * folds four bytes per iteration and the tail handles whatever is left.
  */
 static xfs_dahash_t
 name_entry_hash(
 	struct name_entry	*entry)
 {
 	const uint8_t		*p = (const uint8_t *)entry->skelstr;
 	size_t			left = entry->skelstrlen * sizeof(UChar);
 	xfs_dahash_t		hash = 0;

 	/* Consume four bytes at a time for as long as possible. */
 	while (left >= 4) {
 		hash = (p[0] << 21) ^ (p[1] << 14) ^ (p[2] << 7) ^
 		       (p[3] << 0) ^ rol32(hash, 7 * 4);
 		p += 4;
 		left -= 4;
 	}

 	/* Fold in the remaining zero to three bytes. */
 	if (left == 3)
 		return (p[0] << 14) ^ (p[1] << 7) ^ (p[2] << 0) ^
 		       rol32(hash, 7 * 3);
 	if (left == 2)
 		return (p[0] << 7) ^ (p[1] << 0) ^ rol32(hash, 7 * 2);
 	if (left == 1)
 		return (p[0] << 0) ^ rol32(hash, 7 * 1);
 	return hash;
 }

 /*
  * Set up a collision detector.  If the locale is not utf8, no detector is
  * created: *ucp is set to NULL and 0 is returned.  Otherwise the bucket count
  * is clamped to the range 16..65536, the ICU normalizers and spoof checker
  * are obtained, and the new detector is passed back through @ucp.
  *
  * Returns 0 on success, errno if the allocation fails, or ENOMEM if any ICU
  * setup call fails.
  */
 static int
 unicrash_init(
 	struct unicrash		**ucp,
 	struct scrub_ctx	*ctx,
 	bool			compare_ino,
 	size_t			nr_buckets,
 	bool			is_only_root_writeable)
 {
 	struct unicrash		*uc;
 	UErrorCode		status = U_ZERO_ERROR;

 	if (!is_utf8_locale()) {
 		*ucp = NULL;
 		return 0;
 	}

 	if (nr_buckets < 16)
 		nr_buckets = 16;
 	else if (nr_buckets > 65536)
 		nr_buckets = 65536;

 	uc = calloc(1, UNICRASH_SZ(nr_buckets));
 	if (!uc)
 		return errno;

 	uc->ctx = ctx;
 	uc->nr_buckets = nr_buckets;
 	uc->compare_ino = compare_ino;
 	uc->is_only_root_writeable = is_only_root_writeable;

 	uc->nfkc = unorm2_getNFKCInstance(&status);
 	if (U_FAILURE(status))
 		goto fail_alloc;

 	uc->nfc = unorm2_getNFCInstance(&status);
 	if (U_FAILURE(status))
 		goto fail_alloc;

 	uc->spoof = uspoof_open(&status);
 	if (U_FAILURE(status))
 		goto fail_alloc;

 	uspoof_setChecks(uc->spoof, USPOOF_ALL_CHECKS, &status);
 	if (U_FAILURE(status))
 		goto fail_spoof;

 	*ucp = uc;
 	return 0;

 fail_spoof:
 	uspoof_close(uc->spoof);
 fail_alloc:
 	free(uc);
 	return ENOMEM;
 }

 /*
  * Decide whether this inode has both uid and gid 0 and lacks the
  * other-writable mode bit.  If so, the caller skips even the informational
  * messages, on the theory that the administrator put the names there.
  */
 static bool
 is_only_root_writable(
 	struct xfs_bulkstat	*bstat)
 {
 	bool			root_owned;

 	root_owned = bstat->bs_uid == 0 && bstat->bs_gid == 0;
 	if (!root_owned)
 		return false;

 	return (bstat->bs_mode & S_IWOTH) == 0;
 }

 /*
  * Set up a collision detector for a directory.  Inode numbers are compared,
  * so names that collide but point to the same inode are not reported.
  */
 int
 unicrash_dir_init(
 	struct unicrash		**ucp,
 	struct scrub_ctx	*ctx,
 	struct xfs_bulkstat	*bstat)
 {
 	/*
 	 * Guess one bucket per 64 bytes of directory (one dentry each);
 	 * unicrash_init clamps the count between 16 and 64k.  Same general
 	 * idea as dir_hash_init in xfs_repair.
 	 */
 	size_t			nr_buckets = bstat->bs_size / 64;
 	bool			root_only = is_only_root_writable(bstat);

 	return unicrash_init(ucp, ctx, true, nr_buckets, root_only);
 }

 /*
  * Set up a collision detector for a file's extended attribute names.  Inode
  * numbers are not compared, so any collision is reported.
  */
 int
 unicrash_xattr_init(
 	struct unicrash		**ucp,
 	struct scrub_ctx	*ctx,
 	struct xfs_bulkstat	*bstat)
 {
 	/* Assume 16 attributes per extent for lack of a better idea. */
 	size_t			nr_buckets = 16 * (1 + bstat->bs_aextents);
 	bool			root_only = is_only_root_writable(bstat);

 	return unicrash_init(ucp, ctx, false, nr_buckets, root_only);
 }

 /*
  * Set up a collision detector for a filesystem label: the minimum bucket
  * count, no inode comparison, and treated as writable only by root.
  */
 int
 unicrash_fs_label_init(
 	struct unicrash		**ucp,
 	struct scrub_ctx	*ctx)
 {
 	const size_t		nr_buckets = 16;

 	return unicrash_init(ucp, ctx, false, nr_buckets, true);
 }

 /*
  * Tear down a collision detector: close the spoof checker, free every name
  * entry in every bucket, then free the detector.  A NULL detector is a no-op.
  */
 void
 unicrash_free(
 	struct unicrash		*uc)
 {
 	size_t			b;

 	if (!uc)
 		return;

 	uspoof_close(uc->spoof);

 	for (b = 0; b < uc->nr_buckets; b++) {
 		struct name_entry	*ne = uc->buckets[b];

 		while (ne != NULL) {
 			/* Grab the link before the entry goes away. */
 			struct name_entry	*next = ne->next;

 			name_entry_free(ne);
 			ne = next;
 		}
 	}

 	free(uc);
 }

 /*
  * Complain about Unicode problems.
  *
  * @entry is the name being reported, @badflags is the set of UNICRASH_*
  * problems found with it, @what describes the kind of namespace, and
  * @dup_entry (which may be NULL) is the other name it collides with.  At
  * most one message is emitted; the checks below run from most to least
  * severe and the first one that matches wins.
  *
  * If a name cannot be escaped for printing, nothing is reported.
  */
 static void
 unicrash_complain(
 	struct unicrash		*uc,
 	struct descr		*dsc,
 	const char		*what,
 	struct name_entry	*entry,
 	badname_t		badflags,
 	struct name_entry	*dup_entry)
 {
 	char			*bad1 = NULL;
 	char			*bad2 = NULL;

 	/*
 	 * The escaped strings are handed to "%s" below, so bail out quietly
 	 * rather than pass a NULL pointer to the formatter.
 	 * NOTE(review): assumes string_escape returns NULL on failure, since
 	 * its result is released with free() -- confirm against its definition.
 	 */
 	bad1 = string_escape(entry->name);
 	if (!bad1)
 		return;
 	if (dup_entry) {
 		bad2 = string_escape(dup_entry->name);
 		if (!bad2)
 			goto out;
 	}

 	/*
 	 * Most filechooser UIs do not look for bidirectional overrides when
 	 * they render names.  This can result in misleading name presentation
 	 * that makes "hig<rtl>gnp.sh" render like "highs.png".
 	 */
 	if (badflags & UNICRASH_BIDI_OVERRIDE) {
 		str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
 				bad1, what);
 		goto out;
 	}

 	/*
 	 * Two names that normalize to the same string will render
 	 * identically even though the filesystem considers them unique
 	 * names.  "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
 	 * sequences, but they both appear as "café".
 	 */
 	if (badflags & UNICRASH_NOT_UNIQUE) {
 		str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s renders identically to \"%s\"."),
 				bad1, what, bad2);
 		goto out;
 	}

 	/*
 	 * If a name contains invisible/nonprinting characters and can be
 	 * confused with another name as a result, we should complain.
 	 * "moo<zerowidthspace>cow" and "moocow" are misleading.
 	 */
 	if ((badflags & UNICRASH_INVISIBLE) &&
 	    (badflags & UNICRASH_CONFUSABLE)) {
 		str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
 				bad1, what, bad2);
 		goto out;
 	}

 	/*
 	 * Fake looking file extensions have tricked Linux users into thinking
 	 * that an executable is actually a pdf.  See Lazarus 3cx attack.
 	 */
 	if (badflags & UNICRASH_PHONY_EXTENSION) {
 		str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s contains a possibly deceptive file extension."),
 				bad1, what);
 		goto out;
 	}

 	/*
 	 * Unfiltered control characters can mess up your terminal and render
 	 * invisibly in filechooser UIs.
 	 */
 	if (badflags & UNICRASH_CONTROL_CHAR) {
 		str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s contains control characters."),
 				bad1, what);
 		goto out;
 	}

 	/*
 	 * Skip the informational messages if the inode owning the name is
 	 * only writeable by root, because those files were put there by the
 	 * sysadmin.  Also skip names less than four letters long because
 	 * there's a much higher chance of collisions with short names.
 	 */
 	if (!verbose && (uc->is_only_root_writeable || entry->namelen < 4))
 		goto out;

 	/*
 	 * It's not considered good practice (says Unicode) to mix LTR
 	 * characters with RTL characters.  The mere presence of different
 	 * bidirectional characters isn't enough to trip up software, so don't
 	 * warn about this too loudly.
 	 */
 	if (badflags & UNICRASH_BIDI_MIXED) {
 		str_info(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s mixes bidirectional characters."),
 				bad1, what);
 		goto out;
 	}

 	/*
 	 * We'll note if two names could be confusable with each other, but
 	 * whether or not the user will actually confuse them is dependent
 	 * on the rendering system and the typefaces in use.  Maybe "foo.1"
 	 * and "moo.l" look the same, maybe they do not.
 	 */
 	if (badflags & UNICRASH_CONFUSABLE) {
 		str_info(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s could be confused with \"%s\"."),
 				bad1, what, bad2);
 	}

 out:
 	free(bad1);
 	free(bad2);
 }

 /*
  * Try to add a name -> ino entry to the collision detector.  The name
  * must be skeletonized according to Unicode TR39 to detect names that
  * could be visually confused with each other.
  *
  * The new entry is pushed onto the front of its hash bucket and compared with
  * every older entry in that bucket.  Returns the new entry's own flags plus
  * UNICRASH_NOT_UNIQUE or UNICRASH_CONFUSABLE for the first collision found,
  * in which case *existing_entry points at the older entry.  If the exact same
  * byte sequence is already present, the new entry is freed, *new_entryp is
  * set to NULL, and 0 is returned.
  */
 static badname_t
 unicrash_add(
 	struct unicrash		*uc,
 	struct name_entry	**new_entryp,
 	struct name_entry	**existing_entry)
 {
 	struct name_entry	*fresh = *new_entryp;
 	struct name_entry	**head;
 	struct name_entry	*old;
 	badname_t		flags = fresh->badflags;

 	/* Store name in hashtable. */
 	head = &uc->buckets[name_entry_hash(fresh) % uc->nr_buckets];
 	fresh->next = *head;
 	*head = fresh;

 	for (old = fresh->next; old != NULL; old = old->next) {
 		bool		counts;

 		/*
 		 * If we see the same byte sequence then someone's modifying
 		 * the namespace while we're scanning it.  Update the existing
 		 * entry's inode mapping and erase the new entry from existence.
 		 */
 		if (fresh->namelen == old->namelen &&
 		    !memcmp(fresh->name, old->name, old->namelen)) {
 			old->ino = fresh->ino;
 			*head = fresh->next;
 			name_entry_free(fresh);
 			*new_entryp = NULL;
 			return 0;
 		}

 		/* When comparing inodes, same-inode collisions are ignored. */
 		counts = !uc->compare_ino || old->ino != fresh->ino;

 		/* Same normalization? */
 		if (fresh->normstrlen == old->normstrlen &&
 		    !u_strcmp(fresh->normstr, old->normstr) && counts) {
 			flags |= UNICRASH_NOT_UNIQUE;
 			*existing_entry = old;
 			break;
 		}

 		/* Confusable? */
 		if (fresh->skelstrlen == old->skelstrlen &&
 		    !u_strcmp(fresh->skelstr, old->skelstr) && counts) {
 			flags |= UNICRASH_CONFUSABLE;
 			*existing_entry = old;
 			break;
 		}
 	}

 	return flags;
 }

 /*
  * Check a name for unicode normalization problems or collisions: build an
  * entry for it, add the entry to the detector, and report anything bad.
  * Failure to build the entry is silently ignored.  Always returns 0.
  */
 static int
 __unicrash_check_name(
 	struct unicrash		*uc,
 	struct descr		*dsc,
 	const char		*namedescr,
 	const char		*name,
 	xfs_ino_t		ino)
 {
 	struct name_entry	*fresh = NULL;
 	struct name_entry	*collision = NULL;
 	badname_t		flags;

 	/* If we can't create entry data, just skip it. */
 	if (!name_entry_create(uc, name, ino, &fresh))
 		return 0;

 	flags = unicrash_add(uc, &fresh, &collision);

 	/* The entry is gone if the detector already had this exact name. */
 	if (!fresh || flags == UNICRASH_OK)
 		return 0;

 	unicrash_complain(uc, dsc, namedescr, fresh, flags, collision);
 	return 0;
 }

 /*
  * Check a directory entry for unicode normalization problems or collisions.
  * Does nothing if there is no detector.  Returns the result of the common
  * name check, which is always 0 in the code visible in this file.
  */
 int
 unicrash_check_dir_name(
 	struct unicrash		*uc,
 	struct descr		*dsc,
 	struct dirent		*dentry)
 {
 	if (uc == NULL)
 		return 0;

 	return __unicrash_check_name(uc, dsc, _("directory"), dentry->d_name,
 			dentry->d_ino);
 }

 /*
  * Check an extended attribute name for unicode normalization problems or
  * collisions.  Does nothing if there is no detector.  The name is recorded
  * with inode number 0.  Returns the result of the common name check, which
  * is always 0 in the code visible in this file.
  */
 int
 unicrash_check_xattr_name(
 	struct unicrash		*uc,
 	struct descr		*dsc,
 	const char		*attrname)
 {
 	if (uc == NULL)
 		return 0;

 	return __unicrash_check_name(uc, dsc, _("extended attribute"),
 			attrname, 0);
 }

 /*
  * Check the fs label for unicode normalization problems or misleading bits.
  * Does nothing if there is no detector.  The label is recorded with inode
  * number 0.  Returns the result of the common name check, which is always 0
  * in the code visible in this file.
  */
 int
 unicrash_check_fs_label(
 	struct unicrash		*uc,
 	struct descr		*dsc,
 	const char		*label)
 {
 	if (uc == NULL)
 		return 0;

 	return __unicrash_check_name(uc, dsc, _("filesystem label"), label,
 			0);
 }

 /*
  * Dump a unicode code point and its properties to stdout: first the code
  * point and the bytes of its UTF-8 encoding, then one line per ICU binary
  * property saying whether the code point has it.  If the code point cannot
  * be encoded, only the header line is printed.
  */
 static inline void dump_uchar32(UChar32 c)
 {
 	UChar		uchrstr[UCHAR_PER_UCHAR32];
 	const char	*descr;
 	char		buf[16];
 	int32_t		uchrstrlen, buflen;
 	UProperty	p;
 	UErrorCode	uerr = U_ZERO_ERROR;

 	printf("Unicode point 0x%x:", c);

 	/* Convert UChar32 to UTF8 representation. */
 	uchrstrlen = uchar32_to_uchar(c, uchrstr);
 	if (!uchrstrlen) {
 		/* Finish the header line before giving up. */
 		printf("\n");
 		return;
 	}

 	u_strToUTF8(buf, sizeof(buf), &buflen, uchrstr, uchrstrlen, &uerr);
 	if (!U_FAILURE(uerr) && buflen > 0) {
 		int32_t	i;

 		printf(" \"");
 		/*
 		 * Print each byte as unsigned; a plain char may be signed, in
 		 * which case bytes >= 0x80 would sign-extend and print as
 		 * eight hex digits.
 		 */
 		for (i = 0; i < buflen; i++)
 			printf("\\x%02x", (unsigned char)buf[i]);
 		printf("\"");
 	}
 	printf("\n");

 	for (p = 0; p < UCHAR_BINARY_LIMIT; p++) {
 		int	has;

 		/* Prefer the long property name, fall back to the short one. */
 		descr = u_getPropertyName(p, U_LONG_PROPERTY_NAME);
 		if (!descr)
 			descr = u_getPropertyName(p, U_SHORT_PROPERTY_NAME);

 		has = u_hasBinaryProperty(c, p) ? 1 : 0;
 		if (descr) {
 			printf("  %s(%u) = %d\n", descr, p, has);
 		} else {
 			printf("  ?(%u) = %d\n", p, has);
 		}
 	}
 }

 /*
  * Load libicu and initialize it.  Returns true if initialization failed and
  * false if it succeeded.  As a debugging aid, if XFS_SCRUB_DUMP_CHAR is set
  * in the environment, its value is parsed as a number and that code point
  * is dumped.
  */
 bool
 unicrash_load(void)
 {
 	UErrorCode	status = U_ZERO_ERROR;
 	const char	*dump_req;

 	u_init(&status);
 	if (U_FAILURE(status))
 		return true;

 	dump_req = getenv("XFS_SCRUB_DUMP_CHAR");
 	if (dump_req != NULL) {
 		UChar32	cp = strtol(dump_req, NULL, 0);

 		dump_uchar32(cp);
 	}

 	return false;
 }

 /* Release libicu's resources once nothing needs the library any more. */
 void
 unicrash_unload(void)
 {
 	u_cleanup();
 }