1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
/// implemented with postfix compression (sharing of identical child nodes),
/// which gives both compact size and fast lookup.
///
/// The space of Unicode codepoints is divided into 3 subareas, each
/// represented by a trie with different depth. In the first (0..0x800), there
/// is no trie structure at all; each u64 entry corresponds to a bitvector
/// effectively holding 64 bool values.
///
/// In the second (0x800..0x10000), each child of the root node represents a
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
/// the trie stores an 8-bit index into a shared table of leaf values. This
/// exploits the fact that in reasonable sets, many such leaves can be shared.
///
/// In the third (0x10000..0x110000), each child of the root node represents a
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
/// of a child tree. Each of these 64 bytes represents an index into the table
/// of shared 64-bit leaf values. This exploits the sparse structure in the
/// non-BMP range of most Unicode sets.
pub struct BoolTrie {
    // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
    pub r1: [u64; 32],   // leaves

    // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
    pub r2: [u8; 992],      // first level
    pub r3: &'static [u64],  // leaves

    // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
    pub r4: [u8; 256],       // first level
    pub r5: &'static [u8],   // second level
    pub r6: &'static [u64],  // leaves
}
impl BoolTrie {
    pub fn lookup(&self, c: char) -> bool {
        let c = c as u32;
        if c < 0x800 {
            trie_range_leaf(c, self.r1[(c >> 6) as usize])
        } else if c < 0x10000 {
            let child = self.r2[(c >> 6) as usize - 0x20];
            trie_range_leaf(c, self.r3[child as usize])
        } else {
            let child = self.r4[(c >> 12) as usize - 0x10];
            let leaf = self.r5[((child as usize) << 6) + ((c >> 6) as usize & 0x3f)];
            trie_range_leaf(c, self.r6[leaf as usize])
        }
    }
}

pub struct SmallBoolTrie {
    pub(crate) r1: &'static [u8],  // first level
    pub(crate) r2: &'static [u64],  // leaves
}

impl SmallBoolTrie {
    pub fn lookup(&self, c: char) -> bool {
        let c = c as u32;
        match self.r1.get((c >> 6) as usize) {
            Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
            None => false,
        }
    }
}

fn trie_range_leaf(c: u32, bitmap_chunk: u64) -> bool {
    ((bitmap_chunk >> (c & 63)) & 1) != 0
}