Add rule optimization: domain coverage + CIDR aggregation

- domain entries covered by domain_suffix are removed
- domain/suffix entries covered by domain_keyword are removed
- child suffixes covered by parent suffix are removed
- adjacent/contained CIDRs are merged into larger blocks
- optimization is available via the --optimize/-O flag on the merge and generate commands
- cn-direct: 634 -> 587 rules (-7.4%)
This commit is contained in:
NeoMody
2026-04-01 10:19:18 +08:00
parent 0f08d403fb
commit bb27f2073e
4 changed files with 418 additions and 7 deletions

View File

@@ -5,6 +5,7 @@ import (
"os"
"path/filepath"
"rulekit/internal/engine"
"rulekit/internal/model"
"rulekit/internal/writer"
"strings"
@@ -20,6 +21,7 @@ func init() {
}
generateCmd.Flags().StringP("format", "f", "singbox,clash,surge", "output formats (comma-separated: singbox,srs,clash,surge)")
generateCmd.Flags().StringP("output", "o", "", "output directory (default: rules_dir/output)")
generateCmd.Flags().BoolP("optimize", "O", false, "optimize: merge covered domains, aggregate CIDRs")
rootCmd.AddCommand(generateCmd)
}
@@ -27,6 +29,7 @@ func runGenerate(cmd *cobra.Command, args []string) error {
categoryName := args[0]
formatStr, _ := cmd.Flags().GetString("format")
outputDir, _ := cmd.Flags().GetString("output")
optimize, _ := cmd.Flags().GetBool("optimize")
cfg := loadConfig()
@@ -37,11 +40,24 @@ func runGenerate(cmd *cobra.Command, args []string) error {
return err
}
merged, err := engine.Merge(cfg, categoryName)
if err != nil {
return err
var merged *model.MergedRuleSet
if optimize {
var optResult *engine.OptimizeResult
var err error
merged, optResult, err = engine.MergeOptimized(cfg, categoryName)
if err != nil {
return err
}
fmt.Printf("Merged %s: %d rules (optimized %d -> %d)\n",
categoryName, len(merged.Rules), optResult.Before, optResult.After)
} else {
var err error
merged, err = engine.Merge(cfg, categoryName)
if err != nil {
return err
}
fmt.Printf("Merged %s: %d rules\n", categoryName, len(merged.Rules))
}
fmt.Printf("Merged %s: %d rules\n", categoryName, len(merged.Rules))
formats := strings.Split(formatStr, ",")
for _, fmtName := range formats {

View File

@@ -19,6 +19,7 @@ func init() {
}
mergeCmd.Flags().IntP("limit", "n", 0, "limit output rows (0 = all)")
mergeCmd.Flags().BoolP("stats", "s", false, "show statistics only")
mergeCmd.Flags().BoolP("optimize", "O", false, "optimize: merge covered domains, aggregate CIDRs")
rootCmd.AddCommand(mergeCmd)
}
@@ -26,12 +27,25 @@ func runMerge(cmd *cobra.Command, args []string) error {
categoryName := args[0]
limit, _ := cmd.Flags().GetInt("limit")
statsOnly, _ := cmd.Flags().GetBool("stats")
optimize, _ := cmd.Flags().GetBool("optimize")
cfg := loadConfig()
merged, err := engine.Merge(cfg, categoryName)
if err != nil {
return err
var merged *model.MergedRuleSet
var optResult *engine.OptimizeResult
if optimize {
var err error
merged, optResult, err = engine.MergeOptimized(cfg, categoryName)
if err != nil {
return err
}
} else {
var err error
merged, err = engine.Merge(cfg, categoryName)
if err != nil {
return err
}
}
// Stats
@@ -41,6 +55,11 @@ func runMerge(cmd *cobra.Command, args []string) error {
}
fmt.Printf("Merged: %s (%d rules)\n", merged.Name, len(merged.Rules))
if optResult != nil {
fmt.Printf("Optimized: %d -> %d (-%d domains by suffix, -%d by keyword, -%d CIDRs merged)\n",
optResult.Before, optResult.After,
optResult.DomainsMerged, optResult.KeywordMerged, optResult.CIDRsMerged)
}
for t, c := range types {
fmt.Printf(" %s: %d\n", t, c)
}

View File

@@ -60,6 +60,17 @@ func Merge(cfg *config.Config, categoryName string) (*model.MergedRuleSet, error
}, nil
}
// MergeOptimized runs Merge for the given category and then passes the merged
// rules through Optimize. It returns the rule set with its Rules replaced by
// the optimized slice, together with the OptimizeResult describing what was
// removed. If Merge fails, its error is returned and both results are nil.
func MergeOptimized(cfg *config.Config, categoryName string) (*model.MergedRuleSet, *OptimizeResult, error) {
	ruleSet, err := Merge(cfg, categoryName)
	if err != nil {
		return nil, nil, err
	}

	var stats *OptimizeResult
	ruleSet.Rules, stats = Optimize(ruleSet.Rules)
	return ruleSet, stats, nil
}
func typeOrder(t model.RuleType) int {
switch t {
case model.RuleDomain:

365
internal/engine/optimize.go Normal file
View File

@@ -0,0 +1,365 @@
package engine
import (
"encoding/binary"
"fmt"
"net"
"rulekit/internal/model"
"sort"
"strings"
)
// OptimizeResult tracks what the optimizer did.
type OptimizeResult struct {
	// Before is the number of rules handed to Optimize.
	Before int
	// After is the number of rules Optimize returned.
	After int
	// DomainsMerged counts domain entries removed because a domain_suffix
	// covers them; removeCoveredSuffixes also increments it for each
	// domain_suffix entry removed because a parent suffix exists.
	DomainsMerged int // domain entries removed because a domain_suffix covers them
	// KeywordMerged counts domain and domain_suffix entries removed because
	// their value contains a domain_keyword value.
	KeywordMerged int // domain/suffix entries removed because a keyword covers them
	// CIDRsMerged is the number of parsed CIDR entries minus the number of
	// CIDR blocks left after aggregation.
	CIDRsMerged int // CIDR entries merged into larger blocks
	// Removals holds one human-readable line per removed domain or
	// domain_suffix entry. CIDR aggregation does not add entries here.
	Removals []string // human-readable list of what was removed/merged
}
// Optimize performs semantic deduplication on a merged rule set:
//  1. domain entries matched by a domain_suffix rule are removed
//  2. domain and domain_suffix entries containing a domain_keyword value are removed
//  3. domain_suffix entries that sit below another remaining suffix are removed
//  4. IP CIDR entries are aggregated (contained blocks dropped, adjacent blocks merged)
//
// Keyword, regex and process-name rules are kept as they are, and so is any
// rule whose type the optimizer does not recognise: dropping a rule it cannot
// reason about would silently change what the rule set matches.
//
// The returned slice is sorted by typeOrder and then by Value. The returned
// OptimizeResult carries the rule counts before and after optimization along
// with the per-category removal counters.
func Optimize(rules []model.Rule) ([]model.Rule, *OptimizeResult) {
	result := &OptimizeResult{Before: len(rules)}

	// Bucket the rules by type; each bucket is optimized on its own.
	var domains, suffixes, keywords, regexes, cidrs, procs, others []model.Rule
	for _, r := range rules {
		switch r.Type {
		case model.RuleDomain:
			domains = append(domains, r)
		case model.RuleDomainSuffix:
			suffixes = append(suffixes, r)
		case model.RuleDomainKeyword:
			keywords = append(keywords, r)
		case model.RuleDomainRegex:
			regexes = append(regexes, r)
		case model.RuleIPCIDR:
			cidrs = append(cidrs, r)
		case model.RuleProcessName:
			procs = append(procs, r)
		default:
			// Unknown rule types are passed through untouched instead of
			// being lost.
			others = append(others, r)
		}
	}

	// Suffix set for fast lookup.
	suffixSet := make(map[string]bool, len(suffixes))
	for _, s := range suffixes {
		suffixSet[s.Value] = true
	}

	// Keyword values for substring checks.
	keywordVals := make([]string, len(keywords))
	for i, k := range keywords {
		keywordVals[i] = k.Value
	}

	// 1. Remove domains covered by a domain_suffix.
	var filteredDomains []model.Rule
	for _, d := range domains {
		if coveredBySuffix(d.Value, suffixSet) {
			result.DomainsMerged++
			result.Removals = append(result.Removals,
				fmt.Sprintf("domain:%s (covered by suffix)", d.Value))
		} else {
			filteredDomains = append(filteredDomains, d)
		}
	}

	// 2. Remove domains and suffixes covered by a keyword.
	var filteredDomains2 []model.Rule
	for _, d := range filteredDomains {
		if coveredByKeyword(d.Value, keywordVals) {
			result.KeywordMerged++
			result.Removals = append(result.Removals,
				fmt.Sprintf("domain:%s (covered by keyword)", d.Value))
		} else {
			filteredDomains2 = append(filteredDomains2, d)
		}
	}

	var filteredSuffixes []model.Rule
	for _, s := range suffixes {
		if coveredByKeyword(s.Value, keywordVals) {
			result.KeywordMerged++
			result.Removals = append(result.Removals,
				fmt.Sprintf("domain_suffix:%s (covered by keyword)", s.Value))
		} else {
			filteredSuffixes = append(filteredSuffixes, s)
		}
	}

	// 2b. Remove suffixes covered by a parent suffix,
	// e.g. "a.bilibili.com" is redundant if "bilibili.com" exists.
	filteredSuffixes = removeCoveredSuffixes(filteredSuffixes, result)

	// 3. CIDR aggregation.
	optimizedCIDRs := aggregateCIDRs(cidrs, result)

	// Reassemble and restore a deterministic order.
	var out []model.Rule
	out = append(out, filteredDomains2...)
	out = append(out, filteredSuffixes...)
	out = append(out, keywords...)
	out = append(out, regexes...)
	out = append(out, optimizedCIDRs...)
	out = append(out, procs...)
	out = append(out, others...)

	sort.Slice(out, func(i, j int) bool {
		if out[i].Type != out[j].Type {
			return typeOrder(out[i].Type) < typeOrder(out[j].Type)
		}
		return out[i].Value < out[j].Value
	})

	result.After = len(out)
	return out, result
}
// coveredBySuffix reports whether domain, or any parent obtained by stripping
// leading labels one at a time, is present in suffixSet.
// e.g., "www.bilibili.com" is covered by suffix "bilibili.com", and
// "bilibili.com" is covered by itself.
func coveredBySuffix(domain string, suffixSet map[string]bool) bool {
	candidate := domain
	for {
		if suffixSet[candidate] {
			return true
		}
		// Drop the left-most label and try the parent next.
		dot := strings.IndexByte(candidate, '.')
		if dot < 0 {
			return false
		}
		candidate = candidate[dot+1:]
	}
}
// coveredByKeyword reports whether value contains at least one of the given
// keywords as a substring. The comparison is case-sensitive.
func coveredByKeyword(value string, keywords []string) bool {
	covered := false
	for i := 0; i < len(keywords) && !covered; i++ {
		covered = strings.Contains(value, keywords[i])
	}
	return covered
}
// removeCoveredSuffixes drops every suffix that has a strict parent among the
// given suffixes, e.g. "api.bilibili.com" is redundant when "bilibili.com" is
// also present. Each removal increments result.DomainsMerged and appends a
// line naming the nearest covering parent to result.Removals. The kept
// suffixes are returned in their original order.
func removeCoveredSuffixes(suffixes []model.Rule, result *OptimizeResult) []model.Rule {
	known := make(map[string]bool, len(suffixes))
	for i := range suffixes {
		known[suffixes[i].Value] = true
	}

	var kept []model.Rule
	for _, rule := range suffixes {
		// Walk from the nearest parent upwards; the value itself is never
		// tested, so a suffix does not cover itself.
		parent, found := rule.Value, false
		for !found {
			dot := strings.IndexByte(parent, '.')
			if dot < 0 {
				break
			}
			parent = parent[dot+1:]
			found = known[parent]
		}

		if !found {
			kept = append(kept, rule)
			continue
		}
		result.DomainsMerged++
		result.Removals = append(result.Removals,
			fmt.Sprintf("domain_suffix:%s (covered by suffix:%s)", rule.Value, parent))
	}
	return kept
}
// aggregateCIDRs merges adjacent and contained CIDR blocks.
//
// Every rule value is parsed with net.ParseCIDR and the resulting networks are
// aggregated separately for IPv4 and IPv6 via mergeCIDRList. For each block
// that survives, the first input rule whose canonical network string equals
// that block is reused so its metadata is kept; blocks that only exist as the
// result of a merge are emitted as new rules with Source "optimized".
//
// Values that net.ParseCIDR rejects (for example an address written without a
// prefix length) are returned unchanged after the aggregated blocks, because
// the optimizer must not delete rules it cannot interpret.
//
// result.CIDRsMerged is set to the number of parsed entries minus the number
// of blocks left after aggregation. A nil slice is returned for empty input.
func aggregateCIDRs(cidrs []model.Rule, result *OptimizeResult) []model.Rule {
	if len(cidrs) == 0 {
		return nil
	}

	var (
		v4nets, v6nets []*net.IPNet
		unparsed       []model.Rule
	)
	// Canonical network string -> first rule that produced it.
	cidrSource := make(map[string]model.Rule, len(cidrs))
	for _, r := range cidrs {
		_, ipnet, err := net.ParseCIDR(r.Value)
		if err != nil {
			// Not a CIDR we understand: keep the rule verbatim.
			unparsed = append(unparsed, r)
			continue
		}
		key := ipnet.String()
		if _, exists := cidrSource[key]; !exists {
			cidrSource[key] = r
		}
		if ipnet.IP.To4() != nil {
			v4nets = append(v4nets, ipnet)
		} else {
			v6nets = append(v6nets, ipnet)
		}
	}

	// Remove contained CIDRs and merge adjacent ones, per address family.
	v4merged := mergeCIDRList(v4nets)
	v6merged := mergeCIDRList(v6nets)
	result.CIDRsMerged = (len(v4nets) + len(v6nets)) - (len(v4merged) + len(v6merged))

	out := make([]model.Rule, 0, len(v4merged)+len(v6merged)+len(unparsed))
	for _, family := range [][]*net.IPNet{v4merged, v6merged} {
		for _, n := range family {
			cidrStr := n.String()
			if r, ok := cidrSource[cidrStr]; ok {
				out = append(out, r)
				continue
			}
			out = append(out, model.Rule{
				Type:   model.RuleIPCIDR,
				Value:  cidrStr,
				Source: "optimized",
			})
		}
	}
	out = append(out, unparsed...)
	return out
}
// mergeCIDRList removes contained CIDRs and merges adjacent ones.
//
// The input is expected to hold networks of a single address family whose IP
// is already masked (as produced by net.ParseCIDR). The input slice is not
// modified. The result is the smallest set of blocks covering exactly the
// same addresses, ordered by ascending network address; nil is returned for
// empty input.
func mergeCIDRList(nets []*net.IPNet) []*net.IPNet {
	if len(nets) == 0 {
		return nil
	}

	// Sort a copy so the caller's slice keeps its order.
	sorted := make([]*net.IPNet, len(nets))
	copy(sorted, nets)
	sort.Slice(sorted, func(i, j int) bool {
		if c := compareIPs(sorted[i].IP, sorted[j].IP); c != 0 {
			return c < 0
		}
		iOnes, _ := sorted[i].Mask.Size()
		jOnes, _ := sorted[j].Mask.Size()
		return iOnes < jOnes // shorter prefix (larger range) first
	})

	// Single sweep with a stack of disjoint blocks in ascending order.
	// CIDR blocks are either nested or disjoint, so in sorted order a block
	// can only be contained in the block on top of the stack, and the two
	// halves of a parent block can only appear next to each other.
	var stack []*net.IPNet
	for _, n := range sorted {
		if top := len(stack) - 1; top >= 0 && stack[top].Contains(n.IP) {
			onesTop, _ := stack[top].Mask.Size()
			onesN, _ := n.Mask.Size()
			if onesTop <= onesN { // top has an equal or larger range
				continue
			}
		}
		stack = append(stack, n)

		// Collapse sibling pairs; a freshly built parent may in turn be the
		// sibling of the block below it, so repeat until nothing combines.
		for len(stack) >= 2 {
			parent := tryCombine(stack[len(stack)-2], stack[len(stack)-1])
			if parent == nil {
				break
			}
			stack = append(stack[:len(stack)-2], parent)
		}
	}
	return stack
}

// tryCombine tries to merge two adjacent CIDRs into one.
// e.g., 1.0.0.0/24 + 1.0.1.0/24 = 1.0.0.0/23
//
// It returns nil when the prefix lengths or mask widths differ, when the
// prefix length is 0 (there is no parent), or when the two networks do not
// share the same parent block. It does not check that a and b are different
// networks; callers are expected to pass distinct blocks.
func tryCombine(a, b *net.IPNet) *net.IPNet {
	onesA, bitsA := a.Mask.Size()
	onesB, bitsB := b.Mask.Size()
	if onesA != onesB || bitsA != bitsB {
		return nil
	}
	if onesA == 0 {
		return nil
	}

	// The parent prefix is one bit shorter.
	parentMask := net.CIDRMask(onesA-1, bitsA)

	// Both halves must fall into the same parent block.
	parentA := a.IP.Mask(parentMask)
	parentB := b.IP.Mask(parentMask)
	if !parentA.Equal(parentB) {
		return nil
	}

	return &net.IPNet{
		IP:   parentA,
		Mask: parentMask,
	}
}

// compareIPs orders two IP addresses numerically and returns -1, 0 or 1.
// When both addresses have a 4-byte form they are compared as 32-bit
// big-endian integers; otherwise their 16-byte forms are compared byte by
// byte.
func compareIPs(a, b net.IP) int {
	a4 := a.To4()
	b4 := b.To4()
	if a4 != nil && b4 != nil {
		ai := binary.BigEndian.Uint32(a4)
		bi := binary.BigEndian.Uint32(b4)
		switch {
		case ai < bi:
			return -1
		case ai > bi:
			return 1
		}
		return 0
	}

	a16 := a.To16()
	b16 := b.To16()
	for i := 0; i < 16; i++ {
		if a16[i] < b16[i] {
			return -1
		}
		if a16[i] > b16[i] {
			return 1
		}
	}
	return 0
}