blob: 24d4ea58211bbdd31632b228962991cf503d787a [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2018 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/statvfs.h>
#include <strings.h>
#include <unicode/uclean.h>
#include <unicode/ustring.h>
#include <unicode/unorm2.h>
#include <unicode/uspoof.h>
#include "libfrog/paths.h"
#include "xfs_scrub.h"
#include "common.h"
#include "descr.h"
#include "unicrash.h"
/*
* Detect Unicode confusable names in directories and attributes.
*
* Record all the name->ino mappings in a directory/xattr, with a twist! The
* twist is to record the Unicode skeleton and normalized version of every
* name we see so that we can check for a name space (directory, extended
* attribute set) containing names containing malicious characters or that
* could be confused for one another. These entries are at best a sign of
* Unicode mishandling, or some sort of weird name substitution attack if the
* entries do not point to the same inode. Warn if we see multiple dirents
* that do not all point to the same inode.
*
* For extended attributes we perform the same collision checks on the
* attribute, though any collision is enough to trigger a warning.
*
* We avoid flagging these problems as errors because XFS treats names as a
* sequence of arbitrary nonzero bytes. While a Unicode collision is not
* technically a filesystem corruption, we ought to say something if there's a
* possibility for misleading a user. Unquestionably bad things (direction
* overrides, control characters, names that normalize to the same string)
* produce warnings, whereas potentially confusable names produce
* informational messages.
*
* The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
* the Unicode technical standard #39. First we normalize the name, then we
* substitute code points according to the confusable code point table, then
* normalize again.
*
* We take the extra step of removing non-identifier code points such as
* formatting characters, control characters, zero width characters, etc.
* from the skeleton so that we can complain about names that are confusable
* due to invisible control characters.
*
* In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
*/
struct name_entry {
struct name_entry *next;
/* NFKC normalized name */
UChar *normstr;
size_t normstrlen;
/* Unicode skeletonized name */
UChar *skelstr;
size_t skelstrlen;
xfs_ino_t ino;
/* Raw dirent name */
size_t namelen;
char name[0];
};
#define NAME_ENTRY_SZ(nl) (sizeof(struct name_entry) + 1 + \
(nl * sizeof(uint8_t)))
struct unicrash {
struct scrub_ctx *ctx;
USpoofChecker *spoof;
const UNormalizer2 *normalizer;
bool compare_ino;
bool is_only_root_writeable;
size_t nr_buckets;
struct name_entry *buckets[0];
};
#define UNICRASH_SZ(nr) (sizeof(struct unicrash) + \
(nr * sizeof(struct name_entry *)))
/* Things to complain about in Unicode naming. */
/*
* Multiple names resolve to the same normalized string and therefore render
* identically.
*/
#define UNICRASH_NOT_UNIQUE (1 << 0)
/* Name contains directional overrides. */
#define UNICRASH_BIDI_OVERRIDE (1 << 1)
/* Name mixes left-to-right and right-to-left characters. */
#define UNICRASH_BIDI_MIXED (1 << 2)
/* Control characters in name. */
#define UNICRASH_CONTROL_CHAR (1 << 3)
/* Invisible characters. Only a problem if we have collisions. */
#define UNICRASH_ZERO_WIDTH (1 << 4)
/* Multiple names resolve to the same skeleton string. */
#define UNICRASH_CONFUSABLE (1 << 5)
/*
* We only care about validating utf8 collisions if the underlying
* system configuration says we're using utf8. If the language
* specifier string used to output messages has ".UTF-8" somewhere in
* its name, then we conclude utf8 is in use. Otherwise, no checking is
* performed.
*
* Most modern Linux systems default to utf8, so the only time this
* check will return false is if the administrator configured things
* this way or if things are so messed up there is no locale data at
* all.
*/
#define UTF8_STR ".UTF-8"
#define UTF8_STRLEN (sizeof(UTF8_STR) - 1)
static bool
is_utf8_locale(void)
{
const char *msg_locale;
static int answer = -1;
if (answer != -1)
return answer;
msg_locale = setlocale(LC_MESSAGES, NULL);
if (msg_locale == NULL)
return false;
if (strstr(msg_locale, UTF8_STR) != NULL)
answer = 1;
else
answer = 0;
return answer;
}
/*
* Generate normalized form and skeleton of the name. If this fails, just
* forget everything and return false; this is an advisory checker.
*/
static bool
name_entry_compute_checknames(
struct unicrash *uc,
struct name_entry *entry)
{
UChar *normstr;
UChar *unistr;
UChar *skelstr;
int32_t normstrlen;
int32_t unistrlen;
int32_t skelstrlen;
UChar32 uchr;
int32_t i, j;
UErrorCode uerr = U_ZERO_ERROR;
/* Convert bytestr to unistr for normalization */
u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
if (uerr != U_BUFFER_OVERFLOW_ERROR)
return false;
uerr = U_ZERO_ERROR;
unistr = calloc(unistrlen + 1, sizeof(UChar));
if (!unistr)
return false;
u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
&uerr);
if (U_FAILURE(uerr))
goto out_unistr;
/* Normalize the string. */
normstrlen = unorm2_normalize(uc->normalizer, unistr, unistrlen, NULL,
0, &uerr);
if (uerr != U_BUFFER_OVERFLOW_ERROR)
goto out_unistr;
uerr = U_ZERO_ERROR;
normstr = calloc(normstrlen + 1, sizeof(UChar));
if (!normstr)
goto out_unistr;
unorm2_normalize(uc->normalizer, unistr, unistrlen, normstr, normstrlen,
&uerr);
if (U_FAILURE(uerr))
goto out_normstr;
/* Compute skeleton. */
skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
0, &uerr);
if (uerr != U_BUFFER_OVERFLOW_ERROR)
goto out_normstr;
uerr = U_ZERO_ERROR;
skelstr = calloc(skelstrlen + 1, sizeof(UChar));
if (!skelstr)
goto out_normstr;
uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
&uerr);
if (U_FAILURE(uerr))
goto out_skelstr;
/* Remove control/formatting characters from skeleton. */
for (i = 0, j = 0; i < skelstrlen; j = i) {
U16_NEXT_UNSAFE(skelstr, i, uchr);
if (!u_isIDIgnorable(uchr))
continue;
memmove(&skelstr[j], &skelstr[i],
(skelstrlen - i + 1) * sizeof(UChar));
skelstrlen -= (i - j);
i = j;
}
entry->skelstr = skelstr;
entry->skelstrlen = skelstrlen;
entry->normstr = normstr;
entry->normstrlen = normstrlen;
free(unistr);
return true;
out_skelstr:
free(skelstr);
out_normstr:
free(normstr);
out_unistr:
free(unistr);
return false;
}
/* Create a new name entry, returns false if we could not succeed. */
static bool
name_entry_create(
struct unicrash *uc,
const char *name,
xfs_ino_t ino,
struct name_entry **entry)
{
struct name_entry *new_entry;
size_t namelen = strlen(name);
/* Create new entry */
new_entry = calloc(NAME_ENTRY_SZ(namelen), 1);
if (!new_entry)
return false;
new_entry->next = NULL;
new_entry->ino = ino;
memcpy(new_entry->name, name, namelen);
new_entry->name[namelen] = 0;
new_entry->namelen = namelen;
/* Normalize/skeletonize name to find collisions. */
if (!name_entry_compute_checknames(uc, new_entry))
goto out;
*entry = new_entry;
return true;
out:
free(new_entry);
return false;
}
/* Free a name entry */
static void
name_entry_free(
struct name_entry *entry)
{
free(entry->normstr);
free(entry->skelstr);
free(entry);
}
/* Adapt the dirhash function from libxfs, avoid linking with libxfs. */
#define rol32(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
/*
* Implement a simple hash on a character string.
* Rotate the hash value by 7 bits, then XOR each character in.
* This is implemented with some source-level loop unrolling.
*/
static xfs_dahash_t
name_entry_hash(
struct name_entry *entry)
{
uint8_t *name;
size_t namelen;
xfs_dahash_t hash;
name = (uint8_t *)entry->skelstr;
namelen = entry->skelstrlen * sizeof(UChar);
/*
* Do four characters at a time as long as we can.
*/
for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
(name[3] << 0) ^ rol32(hash, 7 * 4);
/*
* Now do the rest of the characters.
*/
switch (namelen) {
case 3:
return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
rol32(hash, 7 * 3);
case 2:
return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
case 1:
return (name[0] << 0) ^ rol32(hash, 7 * 1);
default: /* case 0: */
return hash;
}
}
/*
* Check a name for suspicious elements that have appeared in filename
* spoofing attacks. This includes names that mixed directions or contain
* direction overrides control characters, both of which have appeared in
* filename spoofing attacks.
*/
static void
name_entry_examine(
struct name_entry *entry,
unsigned int *badflags)
{
UChar32 uchr;
int32_t i;
uint8_t mask = 0;
for (i = 0; i < entry->normstrlen;) {
U16_NEXT_UNSAFE(entry->normstr, i, uchr);
/* zero width character sequences */
switch (uchr) {
case 0x200B: /* zero width space */
case 0x200C: /* zero width non-joiner */
case 0x200D: /* zero width joiner */
case 0xFEFF: /* zero width non breaking space */
case 0x2060: /* word joiner */
case 0x2061: /* function application */
case 0x2062: /* invisible times (multiply) */
case 0x2063: /* invisible separator (comma) */
case 0x2064: /* invisible plus (addition) */
*badflags |= UNICRASH_ZERO_WIDTH;
break;
}
/* control characters */
if (u_iscntrl(uchr))
*badflags |= UNICRASH_CONTROL_CHAR;
switch (u_charDirection(uchr)) {
case U_LEFT_TO_RIGHT:
mask |= 0x01;
break;
case U_RIGHT_TO_LEFT:
mask |= 0x02;
break;
case U_RIGHT_TO_LEFT_OVERRIDE:
*badflags |= UNICRASH_BIDI_OVERRIDE;
break;
case U_LEFT_TO_RIGHT_OVERRIDE:
*badflags |= UNICRASH_BIDI_OVERRIDE;
break;
default:
break;
}
}
/* mixing left-to-right and right-to-left chars */
if (mask == 0x3)
*badflags |= UNICRASH_BIDI_MIXED;
}
/* Initialize the collision detector. */
static int
unicrash_init(
struct unicrash **ucp,
struct scrub_ctx *ctx,
bool compare_ino,
size_t nr_buckets,
bool is_only_root_writeable)
{
struct unicrash *p;
UErrorCode uerr = U_ZERO_ERROR;
if (!is_utf8_locale()) {
*ucp = NULL;
return 0;
}
if (nr_buckets > 65536)
nr_buckets = 65536;
else if (nr_buckets < 16)
nr_buckets = 16;
p = calloc(1, UNICRASH_SZ(nr_buckets));
if (!p)
return errno;
p->ctx = ctx;
p->nr_buckets = nr_buckets;
p->compare_ino = compare_ino;
p->normalizer = unorm2_getNFKCInstance(&uerr);
if (U_FAILURE(uerr))
goto out_free;
p->spoof = uspoof_open(&uerr);
if (U_FAILURE(uerr))
goto out_free;
uspoof_setChecks(p->spoof, USPOOF_ALL_CHECKS, &uerr);
if (U_FAILURE(uerr))
goto out_spoof;
p->is_only_root_writeable = is_only_root_writeable;
*ucp = p;
return 0;
out_spoof:
uspoof_close(p->spoof);
out_free:
free(p);
return ENOMEM;
}
/*
* Is this inode owned by root and not writable by others? If so, skip
* even the informational messages, because this was put in place by the
* administrator.
*/
static bool
is_only_root_writable(
struct xfs_bulkstat *bstat)
{
if (bstat->bs_uid != 0 || bstat->bs_gid != 0)
return false;
return !(bstat->bs_mode & S_IWOTH);
}
/* Initialize the collision detector for a directory. */
int
unicrash_dir_init(
struct unicrash **ucp,
struct scrub_ctx *ctx,
struct xfs_bulkstat *bstat)
{
/*
* Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
* Same general idea as dir_hash_init in xfs_repair.
*/
return unicrash_init(ucp, ctx, true, bstat->bs_size / 64,
is_only_root_writable(bstat));
}
/* Initialize the collision detector for an extended attribute. */
int
unicrash_xattr_init(
struct unicrash **ucp,
struct scrub_ctx *ctx,
struct xfs_bulkstat *bstat)
{
/* Assume 16 attributes per extent for lack of a better idea. */
return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents),
is_only_root_writable(bstat));
}
/* Initialize the collision detector for a filesystem label. */
int
unicrash_fs_label_init(
struct unicrash **ucp,
struct scrub_ctx *ctx)
{
return unicrash_init(ucp, ctx, false, 16, true);
}
/* Free the crash detector. */
void
unicrash_free(
struct unicrash *uc)
{
struct name_entry *ne;
struct name_entry *x;
size_t i;
if (!uc)
return;
uspoof_close(uc->spoof);
for (i = 0; i < uc->nr_buckets; i++) {
for (ne = uc->buckets[i]; ne != NULL; ne = x) {
x = ne->next;
name_entry_free(ne);
}
}
free(uc);
}
/* Complain about Unicode problems. */
static void
unicrash_complain(
struct unicrash *uc,
struct descr *dsc,
const char *what,
struct name_entry *entry,
unsigned int badflags,
struct name_entry *dup_entry)
{
char *bad1 = NULL;
char *bad2 = NULL;
bad1 = string_escape(entry->name);
if (dup_entry)
bad2 = string_escape(dup_entry->name);
/*
* Most filechooser UIs do not look for bidirectional overrides when
* they render names. This can result in misleading name presentation
* that makes "hig<rtl>gnp.sh" render like "highs.png".
*/
if (badflags & UNICRASH_BIDI_OVERRIDE) {
str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
bad1, what);
goto out;
}
/*
* Two names that normalize to the same string will render
* identically even though the filesystem considers them unique
* names. "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
* sequences, but they both appear as "café".
*/
if (badflags & UNICRASH_NOT_UNIQUE) {
str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s renders identically to \"%s\"."),
bad1, what, bad2);
goto out;
}
/*
* If a name contains invisible/nonprinting characters and can be
* confused with another name as a result, we should complain.
* "moo<zerowidthspace>cow" and "moocow" are misleading.
*/
if ((badflags & UNICRASH_ZERO_WIDTH) &&
(badflags & UNICRASH_CONFUSABLE)) {
str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
bad1, what, bad2);
goto out;
}
/*
* Unfiltered control characters can mess up your terminal and render
* invisibly in filechooser UIs.
*/
if (badflags & UNICRASH_CONTROL_CHAR) {
str_warn(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s contains control characters."),
bad1, what);
goto out;
}
/*
* Skip the informational messages if the inode owning the name is
* only writeable by root, because those files were put there by the
* sysadmin. Also skip names less than four letters long because
* there's a much higher chance of collisions with short names.
*/
if (!verbose && (uc->is_only_root_writeable || entry->namelen < 4))
goto out;
/*
* It's not considered good practice (says Unicode) to mix LTR
* characters with RTL characters. The mere presence of different
* bidirectional characters isn't enough to trip up software, so don't
* warn about this too loudly.
*/
if (badflags & UNICRASH_BIDI_MIXED) {
str_info(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s mixes bidirectional characters."),
bad1, what);
goto out;
}
/*
* We'll note if two names could be confusable with each other, but
* whether or not the user will actually confuse them is dependent
* on the rendering system and the typefaces in use. Maybe "foo.1"
* and "moo.l" look the same, maybe they do not.
*/
if (badflags & UNICRASH_CONFUSABLE) {
str_info(uc->ctx, descr_render(dsc),
_("Unicode name \"%s\" in %s could be confused with \"%s\"."),
bad1, what, bad2);
}
out:
free(bad1);
free(bad2);
}
/*
* Try to add a name -> ino entry to the collision detector. The name
* must be skeletonized according to Unicode TR39 to detect names that
* could be visually confused with each other.
*/
static void
unicrash_add(
struct unicrash *uc,
struct name_entry *new_entry,
unsigned int *badflags,
struct name_entry **existing_entry)
{
struct name_entry *entry;
size_t bucket;
xfs_dahash_t hash;
/* Store name in hashtable. */
hash = name_entry_hash(new_entry);
bucket = hash % uc->nr_buckets;
entry = uc->buckets[bucket];
new_entry->next = entry;
uc->buckets[bucket] = new_entry;
while (entry != NULL) {
/*
* If we see the same byte sequence then someone's modifying
* the namespace while we're scanning it. Update the existing
* entry's inode mapping and erase the new entry from existence.
*/
if (new_entry->namelen == entry->namelen &&
!memcmp(new_entry->name, entry->name, entry->namelen)) {
entry->ino = new_entry->ino;
uc->buckets[bucket] = new_entry->next;
name_entry_free(new_entry);
*badflags = 0;
return;
}
/* Same normalization? */
if (new_entry->normstrlen == entry->normstrlen &&
!u_strcmp(new_entry->normstr, entry->normstr) &&
(uc->compare_ino ? entry->ino != new_entry->ino : true)) {
*badflags |= UNICRASH_NOT_UNIQUE;
*existing_entry = entry;
return;
}
/* Confusable? */
if (new_entry->skelstrlen == entry->skelstrlen &&
!u_strcmp(new_entry->skelstr, entry->skelstr) &&
(uc->compare_ino ? entry->ino != new_entry->ino : true)) {
*badflags |= UNICRASH_CONFUSABLE;
*existing_entry = entry;
return;
}
entry = entry->next;
}
}
/* Check a name for unicode normalization problems or collisions. */
static int
__unicrash_check_name(
struct unicrash *uc,
struct descr *dsc,
const char *namedescr,
const char *name,
xfs_ino_t ino)
{
struct name_entry *dup_entry = NULL;
struct name_entry *new_entry = NULL;
unsigned int badflags = 0;
/* If we can't create entry data, just skip it. */
if (!name_entry_create(uc, name, ino, &new_entry))
return 0;
name_entry_examine(new_entry, &badflags);
unicrash_add(uc, new_entry, &badflags, &dup_entry);
if (badflags)
unicrash_complain(uc, dsc, namedescr, new_entry, badflags,
dup_entry);
return 0;
}
/*
* Check a directory entry for unicode normalization problems or collisions.
* If errors occur, this function will log them and return nonzero.
*/
int
unicrash_check_dir_name(
struct unicrash *uc,
struct descr *dsc,
struct dirent *dentry)
{
if (!uc)
return 0;
return __unicrash_check_name(uc, dsc, _("directory"),
dentry->d_name, dentry->d_ino);
}
/*
* Check an extended attribute name for unicode normalization problems
* or collisions. If errors occur, this function will log them and return
* nonzero.
*/
int
unicrash_check_xattr_name(
struct unicrash *uc,
struct descr *dsc,
const char *attrname)
{
if (!uc)
return 0;
return __unicrash_check_name(uc, dsc, _("extended attribute"),
attrname, 0);
}
/*
* Check the fs label for unicode normalization problems or misleading bits.
* If errors occur, this function will log them and return nonzero.
*/
int
unicrash_check_fs_label(
struct unicrash *uc,
struct descr *dsc,
const char *label)
{
if (!uc)
return 0;
return __unicrash_check_name(uc, dsc, _("filesystem label"),
label, 0);
}
/* Load libicu and initialize it. */
bool
unicrash_load(void)
{
UErrorCode uerr = U_ZERO_ERROR;
u_init(&uerr);
return U_FAILURE(uerr);
}
/* Unload libicu once we're done with it. */
void
unicrash_unload(void)
{
u_cleanup();
}