blob: cd950b4458156111170317477c5d2d4cfbbc50da [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2007, 2011 SGI
* All Rights Reserved.
*/
#include "libxfs.h"
#include "init.h"
#include "obfuscate.h"
static inline unsigned char
random_filename_char(void)
{
static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789-_";
return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
}
#define rol32(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
/*
* Given a name and its hash value, massage the name in such a way
* that the result is another name of equal length which shares the
* same hash value.
*/
void
obfuscate_name(
xfs_dahash_t hash,
size_t name_len,
unsigned char *name,
bool is_dirent)
{
unsigned char *oldname = NULL;
unsigned char *newp;
int i;
xfs_dahash_t new_hash;
unsigned char *first;
unsigned char high_bit;
int tries = 0;
bool is_ci_name = is_dirent && xfs_has_asciici(mp);
int shift;
/*
* Our obfuscation algorithm requires at least 5-character
* names, so don't bother if the name is too short. We
* work backward from a hash value to determine the last
* five bytes in a name required to produce a new name
* with the same hash.
*/
if (name_len < 5)
return;
if (is_ci_name) {
oldname = malloc(name_len);
if (!oldname)
return;
memcpy(oldname, name, name_len);
}
again:
newp = name;
new_hash = 0;
/*
* If we cannot generate a ci-compatible obfuscated name after 1000
* tries, don't bother obfuscating the name.
*/
if (tries++ > 1000) {
memcpy(name, oldname, name_len);
goto out_free;
}
/*
* The beginning of the obfuscated name can be pretty much
* anything, so fill it in with random characters.
* Accumulate its new hash value as we go.
*/
for (i = 0; i < name_len - 5; i++) {
*newp = random_filename_char();
if (is_ci_name)
new_hash = xfs_ascii_ci_xfrm(*newp) ^
rol32(new_hash, 7);
else
new_hash = *newp ^ rol32(new_hash, 7);
newp++;
}
/*
* Compute which five bytes need to be used at the end of
* the name so the hash of the obfuscated name is the same
* as the hash of the original. If any result in an invalid
* character, flip a bit and arrange for a corresponding bit
* in a neighboring byte to be flipped as well. For the
* last byte, the "neighbor" to change is the first byte
* we're computing here.
*/
new_hash = rol32(new_hash, 3) ^ hash;
first = newp;
high_bit = 0;
for (shift = 28; shift >= 0; shift -= 7) {
*newp = (new_hash >> shift & 0x7f) ^ high_bit;
if (is_invalid_char(*newp)) {
*newp ^= 1;
high_bit = 0x80;
} else
high_bit = 0;
/*
* If ascii-ci is enabled, uppercase characters are converted
* to lowercase characters while computing the name hash. If
* any of the necessary correction bytes are uppercase, the
* hash of the new name will not match. Try again with a
* different prefix.
*/
if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
goto again;
ASSERT(!is_invalid_char(*newp));
newp++;
}
/*
* If we flipped a bit on the last byte, we need to fix up
* the matching bit in the first byte. The result will
* be a valid character, because we know that first byte
* has 0's in its upper four bits (it was produced by a
* 28-bit right-shift of a 32-bit unsigned value).
*/
if (high_bit) {
*first ^= 0x10;
if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
goto again;
ASSERT(!is_invalid_char(*first));
}
out_free:
free(oldname);
}
/*
* Flip a bit in each of two bytes at the end of the given name.
* This is used in generating a series of alternate names to be used
* in the event a duplicate is found.
*
* The bits flipped are selected such that they both affect the same
* bit in the name's computed hash value, so flipping them both will
* preserve the hash.
*
* The following diagram aims to show the portion of a computed
* hash that a given byte of a name affects.
*
* 31 28 24 21 14 8 7 3 0
* +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
* hash: | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
* +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
* last-4 ->| |<-- last-2 --->| |<--- last ---->|
* |<-- last-3 --->| |<-- last-1 --->| |<- last-4
* |<-- last-7 --->| |<-- last-5 --->|
* |<-- last-8 --->| |<-- last-6 --->|
* . . . and so on
*
* The last byte of the name directly affects the low-order byte of
* the hash. The next-to-last affects bits 7-14, the next one back
* affects bits 14-21, and so on. The effect wraps around when it
* goes beyond the top of the hash (as happens for byte last-4).
*
* Bits that are flipped together "overlap" on the hash value. As
* an example of overlap, the last two bytes both affect bit 7 in
* the hash. That pair of bytes (and their overlapping bits) can be
* used for this "flip bit" operation (it's the first pair tried,
* actually).
*
* A table defines overlapping pairs--the bytes involved and bits
* within them--that can be used this way. The byte offset is
* relative to a starting point within the name, which will be set
* to affect the bytes at the end of the name. The function is
* called with a "bitseq" value which indicates which bit flip is
* desired, and this translates directly into selecting which entry
* in the bit_to_flip[] table to apply.
*
* The function returns 1 if the operation was successful. It
* returns 0 if the result produced a character that's not valid in
* a name (either '/' or a '\0'). Finally, it returns -1 if the bit
* sequence number is beyond what is supported for a name of this
* length.
*
* Discussion
* ----------
* (Also see the discussion above find_alternate(), below.)
*
* In order to make this function work for any length name, the
* table is ordered by increasing byte offset, so that the earliest
* entries can apply to the shortest strings. This way all names
* are done consistently.
*
* When bit flips occur, they can convert printable characters
* into non-printable ones. In an effort to reduce the impact of
* this, the first bit flips are chosen to affect bytes the end of
* the name (and furthermore, toward the low bits of a byte). Those
* bytes are often non-printable anyway because of the way they are
* initially selected by obfuscate_name()). This is accomplished,
* using later table entries first.
*
* Each row in the table doubles the number of alternates that
* can be generated. A two-byte name is limited to using only
* the first row, so it's possible to generate two alternates
* (the original name, plus the alternate produced by flipping
* the one pair of bits). In a 5-byte name, the effect of the
* first byte overlaps the last by 4 its, and there are 8 bits
* to flip, allowing for 256 possible alternates.
*
* Short names (less than 5 bytes) are never even obfuscated, so for
* such names the relatively small number of alternates should never
* really be a problem.
*
* Long names (more than 6 bytes, say) are not likely to exhaust
* the number of available alternates. In fact, the table could
* probably have stopped at 8 entries, on the assumption that 256
* alternates should be enough for most any situation. The entries
* beyond those are present mostly for demonstration of how it could
* be populated with more entries, should it ever be necessary to do
* so.
*/
static int
flip_bit(
size_t name_len,
unsigned char *name,
uint32_t bitseq)
{
int index;
size_t offset;
unsigned char *p0, *p1;
unsigned char m0, m1;
struct {
int byte; /* Offset from start within name */
unsigned char bit; /* Bit within that byte */
} bit_to_flip[][2] = { /* Sorted by second entry's byte */
{ { 0, 0 }, { 1, 7 } }, /* Each row defines a pair */
{ { 1, 0 }, { 2, 7 } }, /* of bytes and a bit within */
{ { 2, 0 }, { 3, 7 } }, /* each byte. Each bit in */
{ { 0, 4 }, { 4, 0 } }, /* a pair affects the same */
{ { 0, 5 }, { 4, 1 } }, /* bit in the hash, so flipping */
{ { 0, 6 }, { 4, 2 } }, /* both will change the name */
{ { 0, 7 }, { 4, 3 } }, /* while preserving the hash. */
{ { 3, 0 }, { 4, 7 } },
{ { 0, 0 }, { 5, 3 } }, /* The first entry's byte offset */
{ { 0, 1 }, { 5, 4 } }, /* must be less than the second. */
{ { 0, 2 }, { 5, 5 } },
{ { 0, 3 }, { 5, 6 } }, /* The table can be extended to */
{ { 0, 4 }, { 5, 7 } }, /* an arbitrary number of entries */
{ { 4, 0 }, { 5, 7 } }, /* but there's not much point. */
/* . . . */
};
/* Find the first entry *not* usable for name of this length */
for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
if (bit_to_flip[index][1].byte >= name_len)
break;
/*
* Back up to the last usable entry. If that number is
* smaller than the bit sequence number, inform the caller
* that nothing this large (or larger) will work.
*/
if (bitseq > --index)
return -1;
/*
* We will be switching bits at the end of name, with a
* preference for affecting the last bytes first. Compute
* where in the name we'll start applying the changes.
*/
offset = name_len - (bit_to_flip[index][1].byte + 1);
index -= bitseq; /* Use later table entries first */
p0 = name + offset + bit_to_flip[index][0].byte;
p1 = name + offset + bit_to_flip[index][1].byte;
m0 = 1 << bit_to_flip[index][0].bit;
m1 = 1 << bit_to_flip[index][1].bit;
/* Only change the bytes if it produces valid characters */
if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
return 0;
*p0 ^= m0;
*p1 ^= m1;
return 1;
}
/*
* This function generates a well-defined sequence of "alternate"
* names for a given name. An alternate is a name having the same
* length and same hash value as the original name. This is needed
* because the algorithm produces only one obfuscated name to use
* for a given original name, and it's possible that result matches
* a name already seen. This function checks for this, and if it
* occurs, finds another suitable obfuscated name to use.
*
* Each bit in the binary representation of the sequence number is
* used to select one possible "bit flip" operation to perform on
* the name. So for example:
* seq = 0: selects no bits to flip
* seq = 1: selects the 0th bit to flip
* seq = 2: selects the 1st bit to flip
* seq = 3: selects the 0th and 1st bit to flip
* ... and so on.
*
* The flip_bit() function takes care of the details of the bit
* flipping within the name. Note that the "1st bit" in this
* context is a bit sequence number; i.e. it doesn't necessarily
* mean bit 0x02 will be changed.
*
* If a valid name (one that contains no '/' or '\0' characters) is
* produced by this process for the given sequence number, this
* function returns 1. If the result is not valid, it returns 0.
* Returns -1 if the sequence number is beyond the the maximum for
* names of the given length.
*
*
* Discussion
* ----------
* The number of alternates available for a given name is dependent
* on its length. A "bit flip" involves inverting two bits in
* a name--the two bits being selected such that their values
* affect the name's hash value in the same way. Alternates are
* thus generated by inverting the value of pairs of such
* "overlapping" bits in the original name. Each byte after the
* first in a name adds at least one bit of overlap to work with.
* (See comments above flip_bit() for more discussion on this.)
*
* So the number of alternates is dependent on the number of such
* overlapping bits in a name. If there are N bit overlaps, there
* 2^N alternates for that hash value.
*
* Here are the number of overlapping bits available for generating
* alternates for names of specific lengths:
* 1 0 (must have 2 bytes to have any overlap)
* 2 1 One bit overlaps--so 2 possible alternates
* 3 2 Two bits overlap--so 4 possible alternates
* 4 4 Three bits overlap, so 2^3 alternates
* 5 8 8 bits overlap (due to wrapping), 256 alternates
* 6 18 2^18 alternates
* 7 28 2^28 alternates
* ...
* It's clear that the number of alternates grows very quickly with
* the length of the name. But note that the set of alternates
* includes invalid names. And for certain (contrived) names, the
* number of valid names is a fairly small fraction of the total
* number of alternates.
*
* The main driver for this infrastructure for coming up with
* alternate names is really related to names 5 (or possibly 6)
* bytes in length. 5-byte obfuscated names contain no randomly-
* generated bytes in them, and the chance of an obfuscated name
* matching an already-seen name is too high to just ignore. This
* methodical selection of alternates ensures we don't produce
* duplicate names unless we have exhausted our options.
*/
int
find_alternate(
size_t name_len,
unsigned char *name,
uint32_t seq)
{
uint32_t bitseq = 0;
uint32_t bits = seq;
if (!seq)
return 1; /* alternate 0 is the original name */
if (name_len < 2) /* Must have 2 bytes to flip */
return -1;
for (bitseq = 0; bits; bitseq++) {
uint32_t mask = 1 << bitseq;
int fb;
if (!(bits & mask))
continue;
fb = flip_bit(name_len, name, bitseq);
if (fb < 1)
return fb ? -1 : 0;
bits ^= mask;
}
return 1;
}