Chore: move "geodata" to package "component"

2021-11-02 18:23:01 +08:00
parent 325b7f455f
commit b515a4e270
19 changed files with 163 additions and 152 deletions
--- a/component/geodata/strmatcher/ac_automaton_matcher.go
+++ b/component/geodata/strmatcher/ac_automaton_matcher.go
@ -0,0 +1,241 @@
+package strmatcher
+
+import (
+	"container/list"
+)
+
+// validCharCount is the number of distinct slots produced by char2Index
+// (values 0-52): 26 case-folded letters, 17 punctuation characters and 10
+// digits. It is the width of every transition row in the trie.
+const validCharCount = 53
+
+// MatchType is the per-node marker stored in ACAutomaton.exists.
+type MatchType struct {
+	matchType Type // kind of pattern recorded at this node
+	exist     bool // true once Add has marked this node as the end of a pattern
+}
+
+// Values for Edge.edgeType.
+const (
+	// TrieEdge marks a transition created by Add.
+	TrieEdge bool = true
+	// FailEdge marks an empty transition or one filled in by Build.
+	FailEdge bool = false
+)
+
+// Edge is one transition out of a trie node.
+type Edge struct {
+	edgeType bool // TrieEdge or FailEdge
+	nextNode int  // index of the target node; 0 is the root and also means "no child" in Add
+}
+
+// ACAutomaton is a trie with fail links over the character set defined by
+// char2Index. Patterns are inserted and inputs are scanned from their last
+// byte to their first. The three slices are indexed by node number; node 0
+// is the root.
+type ACAutomaton struct {
+	trie   [][validCharCount]Edge // transition row of every node
+	fail   []int                  // fail link of every node
+	exists []MatchType            // pattern marker of every node
+	count  int                    // index of the most recently created node
+}
+
+// newNode returns a transition row in which every slot is a fail edge
+// pointing at node 0, which is exactly the zero value of the array.
+func newNode() [validCharCount]Edge {
+	return [validCharCount]Edge{}
+}
+
+// char2Index maps a byte to its slot in a trie transition row. Upper- and
+// lower-case letters share a slot, so lookups are case-insensitive.
+//
+// The slice is built from indexed elements, so its length is '~'+1 (127):
+// indexing it with a larger byte is out of range. Bytes below that bound that
+// are not listed here hold the zero value and are therefore
+// indistinguishable from 'a'/'A'.
+var char2Index = []int{
+	'A':  0,
+	'a':  0,
+	'B':  1,
+	'b':  1,
+	'C':  2,
+	'c':  2,
+	'D':  3,
+	'd':  3,
+	'E':  4,
+	'e':  4,
+	'F':  5,
+	'f':  5,
+	'G':  6,
+	'g':  6,
+	'H':  7,
+	'h':  7,
+	'I':  8,
+	'i':  8,
+	'J':  9,
+	'j':  9,
+	'K':  10,
+	'k':  10,
+	'L':  11,
+	'l':  11,
+	'M':  12,
+	'm':  12,
+	'N':  13,
+	'n':  13,
+	'O':  14,
+	'o':  14,
+	'P':  15,
+	'p':  15,
+	'Q':  16,
+	'q':  16,
+	'R':  17,
+	'r':  17,
+	'S':  18,
+	's':  18,
+	'T':  19,
+	't':  19,
+	'U':  20,
+	'u':  20,
+	'V':  21,
+	'v':  21,
+	'W':  22,
+	'w':  22,
+	'X':  23,
+	'x':  23,
+	'Y':  24,
+	'y':  24,
+	'Z':  25,
+	'z':  25,
+	'!':  26,
+	'$':  27,
+	'&':  28,
+	'\'': 29,
+	'(':  30,
+	')':  31,
+	'*':  32,
+	'+':  33,
+	',':  34,
+	';':  35,
+	'=':  36,
+	':':  37,
+	'%':  38,
+	'-':  39,
+	'.':  40,
+	'_':  41,
+	'~':  42,
+	'0':  43,
+	'1':  44,
+	'2':  45,
+	'3':  46,
+	'4':  47,
+	'5':  48,
+	'6':  49,
+	'7':  50,
+	'8':  51,
+	'9':  52,
+}
+
+// NewACAutomaton returns an automaton that holds only the root node (index
+// 0): an empty transition row, a fail link to itself and an unset marker.
+func NewACAutomaton() *ACAutomaton {
+	return &ACAutomaton{
+		trie:   [][validCharCount]Edge{newNode()},
+		fail:   []int{0},
+		exists: []MatchType{{matchType: Full, exist: false}},
+	}
+}
+
+// Add inserts domain into the trie, walking the pattern from its last byte
+// to its first, and marks the terminal node with t.
+//
+// For Domain patterns the terminal node is marked as an exact (Full) hit and
+// an additional "." child is created and marked Domain, so Match recognises
+// both the domain itself and any of its sub-domains.
+//
+// A pattern containing a byte that has no slot of its own in char2Index is
+// ignored and leaves the automaton untouched: bytes above '~' lie outside
+// the table (indexing it would panic) and other unlisted bytes would be
+// stored as if they were 'a'.
+func (ac *ACAutomaton) Add(domain string, t Type) {
+	// Validate before mutating so a rejected pattern leaves no partial path.
+	for i := 0; i < len(domain); i++ {
+		c := domain[i]
+		if int(c) >= len(char2Index) || (char2Index[c] == 0 && c != 'a' && c != 'A') {
+			return
+		}
+	}
+
+	// child returns the node reached from node through slot idx, allocating
+	// a fresh node the first time the transition is used.
+	child := func(node, idx int) int {
+		if ac.trie[node][idx].nextNode == 0 {
+			ac.count++
+			if len(ac.trie) < ac.count+1 {
+				ac.trie = append(ac.trie, newNode())
+				ac.fail = append(ac.fail, 0)
+				ac.exists = append(ac.exists, MatchType{
+					matchType: Full,
+					exist:     false,
+				})
+			}
+			ac.trie[node][idx] = Edge{
+				edgeType: TrieEdge,
+				nextNode: ac.count,
+			}
+		}
+		return ac.trie[node][idx].nextNode
+	}
+
+	node := 0
+	for i := len(domain) - 1; i >= 0; i-- {
+		node = child(node, char2Index[domain[i]])
+	}
+
+	if t != Domain {
+		ac.exists[node] = MatchType{matchType: t, exist: true}
+		return
+	}
+
+	// Exact hit on the domain itself ...
+	ac.exists[node] = MatchType{matchType: Full, exist: true}
+	// ... and a Domain marker behind the separating dot for sub-domains.
+	node = child(node, char2Index['.'])
+	ac.exists[node] = MatchType{matchType: Domain, exist: true}
+}
+
+// Build turns the trie into an automaton. Nodes are visited breadth-first
+// starting with the root's children. For every slot of the visited node, an
+// existing child receives as its fail link the target that the node's own
+// fail node has for the same slot; an empty slot is filled with a fail edge
+// to that same target.
+func (ac *ACAutomaton) Build() {
+	pending := list.New()
+	for c := range ac.trie[0] {
+		if child := ac.trie[0][c].nextNode; child != 0 {
+			pending.PushBack(child)
+		}
+	}
+	for head := pending.Front(); head != nil; head = pending.Front() {
+		cur := pending.Remove(head).(int)
+		for c := range ac.trie[cur] {
+			// Target reached for slot c when following cur's fail link.
+			inherited := ac.trie[ac.fail[cur]][c].nextNode
+			if child := ac.trie[cur][c].nextNode; child != 0 {
+				ac.fail[child] = inherited
+				pending.PushBack(child)
+				continue
+			}
+			ac.trie[cur][c] = Edge{edgeType: FailEdge, nextNode: inherited}
+		}
+	}
+}
+
+// Match reports whether s is matched by any pattern added to the automaton.
+// The input is scanned from its last byte to its first:
+//
+//  1. If every step followed a trie edge, the scan can end in a Full hit or
+//     pass through a Domain marker.
+//  2. Once a fail edge has been taken, only Substr markers can match.
+//
+// A byte with no slot of its own in char2Index can never be part of a stored
+// pattern. It resets the scan to the root and rules out Full/Domain matches,
+// instead of panicking (bytes above '~') or being treated as 'a' (other
+// unlisted bytes).
+func (ac *ACAutomaton) Match(s string) bool {
+	node := 0
+	fullMatch := true
+	for i := len(s) - 1; i >= 0; i-- {
+		c := s[i]
+		if int(c) >= len(char2Index) || (char2Index[c] == 0 && c != 'a' && c != 'A') {
+			node = 0
+			fullMatch = false
+			continue
+		}
+		edge := ac.trie[node][char2Index[c]]
+		fullMatch = fullMatch && edge.edgeType
+		node = edge.nextNode
+		switch ac.exists[node].matchType {
+		case Substr:
+			return true
+		case Domain:
+			if fullMatch {
+				return true
+			}
+		}
+	}
+	return fullMatch && ac.exists[node].exist
+}
--- a/component/geodata/strmatcher/domain_matcher.go
+++ b/component/geodata/strmatcher/domain_matcher.go
@ -0,0 +1,98 @@
+package strmatcher
+
+import "strings"
+
+// breakDomain splits domain into its dot-separated labels.
+func breakDomain(domain string) []string {
+	const sep = "."
+	labels := strings.Split(domain, sep)
+	return labels
+}
+
+// node is one label in the domain tree used by DomainMatcherGroup.
+type node struct {
+	values []uint32         // values registered for the domain that ends at this label
+	sub    map[string]*node // child labels, keyed by the next label to the left
+}
+
+// DomainMatcherGroup is an IndexMatcher for a large set of Domain matchers.
+// Domains are stored label by label in a tree, top-level label first.
+// Visible for testing only.
+type DomainMatcherGroup struct {
+	root *node // nil until the first Add
+}
+
+// Add registers value for domain. The labels of domain are inserted into the
+// tree from the rightmost label to the leftmost, creating nodes as needed,
+// and value is appended to the node of the leftmost label.
+func (g *DomainMatcherGroup) Add(domain string, value uint32) {
+	if g.root == nil {
+		g.root = &node{}
+	}
+
+	labels := breakDomain(domain)
+	cur := g.root
+	for i := len(labels); i > 0; i-- {
+		label := labels[i-1]
+		if cur.sub == nil {
+			cur.sub = map[string]*node{}
+		}
+		child := cur.sub[label]
+		if child == nil {
+			child = &node{}
+			cur.sub[label] = child
+		}
+		cur = child
+	}
+
+	cur.values = append(cur.values, value)
+}
+
+// addMatcher registers value for the domain held by m; it forwards to Add.
+func (g *DomainMatcherGroup) addMatcher(m domainMatcher, value uint32) {
+	g.Add(string(m), value)
+}
+
+// Match returns the values of every registered domain that domain equals or
+// is a sub-domain of. The tree is walked label by label from the right; the
+// values of each visited node are collected and returned with the deepest
+// (most specific) match first. It returns nil when nothing matches, when
+// domain is empty or when no domain has been added.
+//
+// When exactly one node matches, its values slice is returned as is, not
+// copied.
+func (g *DomainMatcherGroup) Match(domain string) []uint32 {
+	if domain == "" || g.root == nil {
+		return nil
+	}
+
+	var hits [][]uint32
+	cur := g.root
+	for end := len(domain); end != -1 && cur.sub != nil; {
+		// start is the index of the dot in front of the current label, or -1.
+		start := strings.LastIndexByte(domain[:end], '.')
+		child := cur.sub[domain[start+1:end]]
+		if child == nil {
+			break
+		}
+		cur, end = child, start
+		if len(cur.values) > 0 {
+			hits = append(hits, cur.values)
+		}
+	}
+
+	switch len(hits) {
+	case 0:
+		return nil
+	case 1:
+		return hits[0]
+	}
+
+	// Emit in reverse: the sub-domain that matches further ranks higher.
+	merged := []uint32{}
+	for i := len(hits) - 1; i >= 0; i-- {
+		merged = append(merged, hits[i]...)
+	}
+	return merged
+}
--- a/component/geodata/strmatcher/full_matcher.go
+++ b/component/geodata/strmatcher/full_matcher.go
@ -0,0 +1,25 @@
+package strmatcher
+
+// FullMatcherGroup maps exact strings to the values registered for them.
+// The zero value is ready to use.
+type FullMatcherGroup struct {
+	matchers map[string][]uint32 // nil until the first Add
+}
+
+// Add registers value for the exact string domain, creating the underlying
+// map on first use. Values added for the same string accumulate in order.
+func (g *FullMatcherGroup) Add(domain string, value uint32) {
+	if g.matchers == nil {
+		g.matchers = map[string][]uint32{domain: {value}}
+		return
+	}
+	g.matchers[domain] = append(g.matchers[domain], value)
+}
+
+// addMatcher registers value for the string held by m; it forwards to Add.
+func (g *FullMatcherGroup) addMatcher(m fullMatcher, value uint32) {
+	g.Add(string(m), value)
+}
+
+// Match returns the values registered for exactly str, or nil if there are
+// none. Reading from a nil map yields the zero value, so a group that has
+// never been added to needs no special case.
+func (g *FullMatcherGroup) Match(str string) []uint32 {
+	return g.matchers[str]
+}
--- a/component/geodata/strmatcher/matchers.go
+++ b/component/geodata/strmatcher/matchers.go
@ -0,0 +1,52 @@
+package strmatcher
+
+import (
+	"regexp"
+	"strings"
+)
+
+// fullMatcher matches only the string it holds.
+type fullMatcher string
+
+// Match reports whether s is identical to the pattern.
+func (m fullMatcher) Match(s string) bool {
+	return s == string(m)
+}
+
+// String returns the pattern prefixed with "full:".
+func (m fullMatcher) String() string {
+	const prefix = "full:"
+	return prefix + string(m)
+}
+
+// substrMatcher matches every string that contains the pattern it holds.
+type substrMatcher string
+
+// Match reports whether the pattern occurs anywhere in s.
+func (m substrMatcher) Match(s string) bool {
+	keyword := string(m)
+	return strings.Contains(s, keyword)
+}
+
+// String returns the pattern prefixed with "keyword:".
+func (m substrMatcher) String() string {
+	const prefix = "keyword:"
+	return prefix + string(m)
+}
+
+// domainMatcher matches the domain it holds and every sub-domain of it.
+type domainMatcher string
+
+// Match reports whether s equals the pattern or ends with "." followed by
+// the pattern.
+func (m domainMatcher) Match(s string) bool {
+	suffix := string(m)
+	if !strings.HasSuffix(s, suffix) {
+		return false
+	}
+	// rest is the number of bytes of s in front of the matched suffix.
+	rest := len(s) - len(suffix)
+	if rest == 0 {
+		return true
+	}
+	return s[rest-1] == '.'
+}
+
+// String returns the pattern prefixed with "domain:".
+func (m domainMatcher) String() string {
+	const prefix = "domain:"
+	return prefix + string(m)
+}
+
+// regexMatcher matches strings against a compiled regular expression.
+type regexMatcher struct {
+	pattern *regexp.Regexp
+}
+
+// Match reports whether s contains any match of the regular expression.
+func (m *regexMatcher) Match(s string) bool {
+	re := m.pattern
+	return re.MatchString(s)
+}
+
+// String returns the source text of the expression prefixed with "regexp:".
+func (m *regexMatcher) String() string {
+	const prefix = "regexp:"
+	return prefix + m.pattern.String()
+}
--- a/component/geodata/strmatcher/mph_matcher.go
+++ b/component/geodata/strmatcher/mph_matcher.go
@ -0,0 +1,304 @@
+package strmatcher
+
+import (
+	"math/bits"
+	"regexp"
+	"sort"
+	"strings"
+	"unsafe"
+)
+
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
+// RollingHash returns the polynomial rolling hash of s with base PrimeRK.
+// Bytes are folded in from the last one to the first, with uint32
+// wrap-around, so the hash of c+s can be derived from the hash of s as
+// hash(s)*PrimeRK + c.
+func RollingHash(s string) uint32 {
+	var h uint32
+	for i := len(s); i > 0; i-- {
+		h = h*PrimeRK + uint32(s[i-1])
+	}
+	return h
+}
+
+// A MphMatcherGroup is divided into three parts:
+// 1. `full` and `domain` patterns are matched by Rabin-Karp algorithm and minimal perfect hash table;
+// 2. `substr` patterns are matched by ac automaton;
+// 3. `regex` patterns are matched with the regex library.
+//
+// Patterns are collected with AddPattern; Build must then be called before
+// Match or Lookup, because the hash tables are only allocated there.
+type MphMatcherGroup struct {
+	ac            *ACAutomaton        // substr patterns; nil until the first one is added
+	otherMatchers []matcherEntry      // regex patterns
+	rules         []string            // full/domain rule strings, filled by Build
+	level0        []uint32            // per-bucket seeds of the hash table
+	level0Mask    int                 // len(level0) - 1
+	level1        []uint32            // slot -> index into rules
+	level1Mask    int                 // len(level1) - 1
+	count         uint32              // id returned by AddPattern
+	ruleMap       *map[string]uint32  // rule -> rolling hash; set to nil by Build
+}
+
+// AddFullOrDomainPattern records pattern together with its rolling hash in
+// the rule map. For Domain patterns the dotted form "."+pattern is recorded
+// as well, so that sub-domains can be found by suffix lookup. Any other type
+// is ignored.
+func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
+	if t != Domain && t != Full {
+		return
+	}
+	h := RollingHash(pattern)
+	rules := *g.ruleMap
+	if t == Domain {
+		// The hash runs right to left, so a prepended byte is folded in last.
+		rules["."+pattern] = h*PrimeRK + uint32('.')
+	}
+	rules[pattern] = h
+}
+
+// NewMphMatcherGroup returns an empty group whose rule map is allocated and
+// whose count is 1. All other fields keep their zero values.
+func NewMphMatcherGroup() *MphMatcherGroup {
+	rules := map[string]uint32{}
+	return &MphMatcherGroup{
+		count:   1,
+		ruleMap: &rules,
+	}
+}
+
+// AddPattern adds a pattern of type t to the group and returns the group's
+// count field as its id.
+//
+// Full and Domain patterns are lower-cased and stored in the rule map,
+// Substr patterns go into the AC automaton (created on first use), and Regex
+// patterns are compiled and kept in otherMatchers. A regex that fails to
+// compile yields (0, err). An unknown type panics.
+func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
+	switch t {
+	case Full, Domain:
+		g.AddFullOrDomainPattern(strings.ToLower(pattern), t)
+	case Substr:
+		if g.ac == nil {
+			g.ac = NewACAutomaton()
+		}
+		g.ac.Add(pattern, t)
+	case Regex:
+		re, err := regexp.Compile(pattern)
+		if err != nil {
+			return 0, err
+		}
+		entry := matcherEntry{
+			m:  &regexMatcher{pattern: re},
+			id: g.count,
+		}
+		g.otherMatchers = append(g.otherMatchers, entry)
+	default:
+		panic("Unknown type")
+	}
+	return g.count, nil
+}
+
+// Build builds a minimal perfect hash table and ac automaton from insert rules.
+//
+// The collected rules are split into level0 buckets by their rolling hash.
+// For each bucket, largest first, a seed is searched so that every rule of
+// the bucket hashes into a still-free level1 slot. The seed is stored in
+// level0 and the rule index in level1.
+//
+// Build consumes the rule map: it is set to nil here, so calling Build or
+// AddFullOrDomainPattern afterwards dereferences a nil pointer.
+func (g *MphMatcherGroup) Build() {
+	if g.ac != nil {
+		g.ac.Build()
+	}
+	keyLen := len(*g.ruleMap)
+	if keyLen == 0 {
+		// Insert a placeholder so that rules and both tables are never empty.
+		// NOTE(review): this looks like it makes the literal "empty___" a
+		// matching input for a group without full/domain rules; confirm.
+		keyLen = 1
+		(*g.ruleMap)["empty___"] = RollingHash("empty___")
+	}
+	g.level0 = make([]uint32, nextPow2(keyLen/4))
+	g.level0Mask = len(g.level0) - 1
+	g.level1 = make([]uint32, nextPow2(keyLen))
+	g.level1Mask = len(g.level1) - 1
+	// sparseBuckets[n] lists the indices (into g.rules) of the rules whose
+	// rolling hash selects level0 bucket n.
+	sparseBuckets := make([][]int, len(g.level0))
+	var ruleIdx int
+	for rule, hash := range *g.ruleMap {
+		n := int(hash) & g.level0Mask
+		g.rules = append(g.rules, rule)
+		sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
+		ruleIdx++
+	}
+	g.ruleMap = nil
+	var buckets []indexBucket
+	for n, vals := range sparseBuckets {
+		if len(vals) > 0 {
+			buckets = append(buckets, indexBucket{n, vals})
+		}
+	}
+	// Largest buckets first (see bySize.Less).
+	sort.Sort(bySize(buckets))
+
+	// occ marks level1 slots that are already taken; tmpOcc remembers the
+	// slots claimed during the current seed attempt so they can be released.
+	occ := make([]bool, len(g.level1))
+	var tmpOcc []int
+	for _, bucket := range buckets {
+		seed := uint32(0)
+		for {
+			findSeed := true
+			tmpOcc = tmpOcc[:0]
+			for _, i := range bucket.vals {
+				n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
+				if occ[n] {
+					// Collision: release this attempt's slots and try the next seed.
+					for _, n := range tmpOcc {
+						occ[n] = false
+					}
+					seed++
+					findSeed = false
+					break
+				}
+				occ[n] = true
+				tmpOcc = append(tmpOcc, n)
+				g.level1[n] = uint32(i)
+			}
+			if findSeed {
+				g.level0[bucket.n] = seed
+				break
+			}
+		}
+	}
+}
+
+// nextPow2 returns 1 for v <= 1 and otherwise the smallest power of two that
+// is strictly greater than v (so 2 -> 4, 3 -> 4, 4 -> 8).
+func nextPow2(v int) int {
+	if v <= 1 {
+		return 1
+	}
+	// bits.Len is the position of the highest set bit, counted from 1.
+	return int(uint(1) << bits.Len(uint(v)))
+}
+
+// Lookup reports whether s is one of the rules stored by Build. h selects
+// the level0 bucket and is expected to be the rolling hash of s, as computed
+// in Match. The bucket's seed is then used to hash s into a level1 slot,
+// whose rule is compared with s. Build must have been called first.
+func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
+	seed := g.level0[int(h)&g.level0Mask]
+	slot := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
+	return g.rules[int(g.level1[slot])] == s
+}
+
+// Match implements IndexMatcher.Match.
+//
+// The pattern is scanned from right to left while its rolling hash is
+// updated. At every '.', the suffix starting at that dot is looked up, which
+// finds Domain rules for parent domains. The whole pattern is then looked
+// up, followed by the AC automaton and finally the regex matchers. The first
+// hit ends the search: hash and automaton hits return [1], a regex hit
+// returns that matcher's id, and no hit returns nil.
+func (g *MphMatcherGroup) Match(pattern string) []uint32 {
+	var hash uint32
+	for i := len(pattern) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(pattern[i])
+		if pattern[i] == '.' && g.Lookup(hash, pattern[i:]) {
+			return []uint32{1}
+		}
+	}
+	if g.Lookup(hash, pattern) || (g.ac != nil && g.ac.Match(pattern)) {
+		return []uint32{1}
+	}
+	for _, e := range g.otherMatchers {
+		if e.m.Match(pattern) {
+			return []uint32{e.id}
+		}
+	}
+	return nil
+}
+
+// indexBucket is one non-empty level0 bucket collected in Build.
+type indexBucket struct {
+	n    int   // bucket number, i.e. index into level0
+	vals []int // indices into rules of the entries that fall into this bucket
+}
+
+// bySize implements sort.Interface and orders buckets by the number of
+// entries they hold, largest first.
+type bySize []indexBucket
+
+func (s bySize) Len() int           { return len(s) }
+func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
+func (s bySize) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+
+// stringStruct is the layout through which strhashFallback reads a string
+// value: a data pointer followed by a length.
+// NOTE(review): presumably mirrors the runtime's string header; confirm.
+type stringStruct struct {
+	str unsafe.Pointer
+	len int
+}
+
+// strhashFallback hashes the string that a points to (a must be a *string
+// cast to unsafe.Pointer) with seed h, by passing its data pointer and
+// length to memhashFallback.
+func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
+	x := (*stringStruct)(a)
+	return memhashFallback(x.str, h, uintptr(x.len))
+}
+
+// Multipliers used by the mixing steps of memhashFallback.
+const (
+	// Constants for multiplication: four random odd 64-bit numbers.
+	m1 = 16877499708836156737
+	m2 = 2820277070424839065
+	m3 = 9497967016996688599
+	m4 = 15839092249703872147
+)
+
+// hashkey holds the per-lane keys mixed into memhashFallback. They are fixed
+// to 1 here, so hashes are the same on every run.
+var hashkey = [4]uintptr{1, 1, 1, 1}
+
+// memhashFallback hashes the s bytes starting at p with the given seed and
+// returns the result as a uintptr. The input is consumed in a way that
+// depends on its length: short inputs are mixed in one or a few steps, while
+// inputs longer than 32 bytes are folded 32 bytes at a time into four lanes
+// before the remaining tail is handled by the short cases.
+// NOTE(review): the structure resembles the Go runtime's generic 64-bit
+// memhash fallback; confirm the provenance before changing anything here.
+func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
+	h := uint64(seed + s*hashkey[0])
+tail:
+	switch {
+	case s == 0:
+	case s < 4:
+		// 1-3 bytes: first, middle and last byte (they may coincide).
+		h ^= uint64(*(*byte)(p))
+		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
+		h ^= uint64(*(*byte)(add(p, s-1))) << 16
+		h = rotl31(h*m1) * m2
+	case s <= 8:
+		// 4-8 bytes: first and last 32-bit word (they may overlap).
+		h ^= uint64(readUnaligned32(p))
+		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
+		h = rotl31(h*m1) * m2
+	case s <= 16:
+		// 9-16 bytes: first and last 64-bit word (they may overlap).
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	case s <= 32:
+		// 17-32 bytes: first two and last two 64-bit words (they may overlap).
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, 8))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-16))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	default:
+		// More than 32 bytes: consume 32 bytes per iteration in four lanes,
+		// combine the lanes, then hash what is left via the cases above.
+		v1 := h
+		v2 := uint64(seed * hashkey[1])
+		v3 := uint64(seed * hashkey[2])
+		v4 := uint64(seed * hashkey[3])
+		for s >= 32 {
+			v1 ^= readUnaligned64(p)
+			v1 = rotl31(v1*m1) * m2
+			p = add(p, 8)
+			v2 ^= readUnaligned64(p)
+			v2 = rotl31(v2*m2) * m3
+			p = add(p, 8)
+			v3 ^= readUnaligned64(p)
+			v3 = rotl31(v3*m3) * m4
+			p = add(p, 8)
+			v4 ^= readUnaligned64(p)
+			v4 = rotl31(v4*m4) * m1
+			p = add(p, 8)
+			s -= 32
+		}
+		h = v1 ^ v2 ^ v3 ^ v4
+		goto tail
+	}
+
+	// Final mixing of the accumulated state.
+	h ^= h >> 29
+	h *= m3
+	h ^= h >> 32
+	return uintptr(h)
+}
+
+// add returns the pointer p advanced by x bytes. The conversion to uintptr
+// and back happens within a single expression.
+func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(p) + x)
+}
+
+// readUnaligned32 reads the four bytes at p as a little-endian uint32. The
+// bytes are read one at a time, so p needs no particular alignment.
+func readUnaligned32(p unsafe.Pointer) uint32 {
+	b := (*[4]byte)(p)
+	lo := uint32(b[0]) | uint32(b[1])<<8
+	hi := uint32(b[2]) | uint32(b[3])<<8
+	return lo | hi<<16
+}
+
+// rotl31 rotates x left by 31 bits.
+func rotl31(x uint64) uint64 {
+	return bits.RotateLeft64(x, 31)
+}
+
+// readUnaligned64 reads the eight bytes at p as a little-endian uint64. The
+// bytes are read one at a time, so p needs no particular alignment.
+func readUnaligned64(p unsafe.Pointer) uint64 {
+	b := (*[8]byte)(p)
+	lo := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24
+	hi := uint64(b[4]) | uint64(b[5])<<8 | uint64(b[6])<<16 | uint64(b[7])<<24
+	return lo | hi<<32
+}
--- a/component/geodata/strmatcher/package_info.go
+++ b/component/geodata/strmatcher/package_info.go
@ -0,0 +1,4 @@
+// Modified from: https://github.com/v2fly/v2ray-core/tree/master/common/strmatcher
+// License: MIT
+
+package strmatcher
--- a/component/geodata/strmatcher/strmatcher.go
+++ b/component/geodata/strmatcher/strmatcher.go
@ -0,0 +1,107 @@
+package strmatcher
+
+import (
+	"regexp"
+)
+
+// Matcher is the interface that determines whether a string matches a pattern.
+type Matcher interface {
+	// Match returns true if the given string matches a predefined pattern.
+	Match(string) bool
+	// String returns a textual form of the matcher.
+	String() string
+}
+
+// Type is the type of the matcher.
+type Type byte
+
+const (
+	// Full is the type of matcher where the input string must be exactly equal to the pattern.
+	Full Type = iota
+	// Substr is the type of matcher where the input string must contain the pattern as a sub-string.
+	Substr
+	// Domain is the type of matcher where the input string must be the pattern itself or a sub-domain of it.
+	Domain
+	// Regex is the type of matcher where the input string must match the regular-expression pattern.
+	Regex
+)
+
+// New creates a new Matcher of type t for the given pattern. The pattern is
+// used as is; no case folding is applied. For Regex the pattern is compiled
+// and a compile error is returned with a nil Matcher. An unknown type
+// panics.
+func (t Type) New(pattern string) (Matcher, error) {
+	switch t {
+	case Full:
+		return fullMatcher(pattern), nil
+	case Substr:
+		return substrMatcher(pattern), nil
+	case Domain:
+		return domainMatcher(pattern), nil
+	case Regex:
+		re, err := regexp.Compile(pattern)
+		if err != nil {
+			return nil, err
+		}
+		return &regexMatcher{pattern: re}, nil
+	}
+	panic("Unknown type")
+}
+
+// IndexMatcher is the interface for matching with a group of matchers.
+type IndexMatcher interface {
+	// Match returns the indices of the matchers that match the input. The
+	// result has no elements if no such matcher exists.
+	Match(input string) []uint32
+}
+
+// matcherEntry pairs a Matcher with the id reported when it matches.
+type matcherEntry struct {
+	m  Matcher
+	id uint32
+}
+
+// MatcherGroup is an implementation of IndexMatcher.
+// Empty initialization works.
+type MatcherGroup struct {
+	count         uint32             // number of matchers added so far; also the last id handed out
+	fullMatcher   FullMatcherGroup   // all fullMatcher patterns
+	domainMatcher DomainMatcherGroup // all domainMatcher patterns
+	otherMatchers []matcherEntry     // every other Matcher, tested one by one
+}
+
+// Add adds a new Matcher into the MatcherGroup, and returns its index. The
+// index will never be 0. Full and domain matchers are stored in their
+// dedicated groups; any other Matcher is kept in a list that Match scans
+// linearly.
+func (g *MatcherGroup) Add(m Matcher) uint32 {
+	g.count++
+	id := g.count
+
+	switch typed := m.(type) {
+	case fullMatcher:
+		g.fullMatcher.addMatcher(typed, id)
+		return id
+	case domainMatcher:
+		g.domainMatcher.addMatcher(typed, id)
+		return id
+	}
+
+	g.otherMatchers = append(g.otherMatchers, matcherEntry{m: m, id: id})
+	return id
+}
+
+// Match implements IndexMatcher.Match. It returns the ids of all matchers
+// that match pattern: full matches first, then domain matches, then the
+// remaining matchers in insertion order. The result is an empty, non-nil
+// slice when nothing matches.
+func (g *MatcherGroup) Match(pattern string) []uint32 {
+	ids := append([]uint32{}, g.fullMatcher.Match(pattern)...)
+	ids = append(ids, g.domainMatcher.Match(pattern)...)
+	for i := range g.otherMatchers {
+		entry := g.otherMatchers[i]
+		if entry.m.Match(pattern) {
+			ids = append(ids, entry.id)
+		}
+	}
+	return ids
+}
+
+// Size returns the number of matchers in the MatcherGroup, i.e. how many
+// times Add has been called.
+func (g *MatcherGroup) Size() uint32 {
+	return g.count
+}