From 8adf03603c97170c52ed5e2385e2c0d55b6462b0 Mon Sep 17 00:00:00 2001 From: bol-van Date: Fri, 5 Nov 2021 00:11:05 +0300 Subject: [PATCH] ipset: use awk instead of grep to avoid wrong ip matches --- ipset/create_ipset.sh | 4 ++-- ipset/def.sh | 19 ++++++++++++------- ipset/get_reestr_combined.sh | 9 +++++++-- ipset/get_reestr_hostlist.sh | 2 +- ipset/get_reestr_ip.sh | 9 +++++++-- 5 files changed, 29 insertions(+), 14 deletions(-) diff --git a/ipset/create_ipset.sh b/ipset/create_ipset.sh index d0281f3..14ee16c 100755 --- a/ipset/create_ipset.sh +++ b/ipset/create_ipset.sh @@ -26,7 +26,7 @@ file_extract_lines() # $2 - from line (starting with 0) # $3 - line count # awk "{ err=1 } NR < $(($2+1)) { next } { print; err=0 } NR == $(($2+$3)) { exit err } END {exit err}" "$1" - awk "NR < $(($2+1)) { next } { print } NR == $(($2+$3)) { exit }" "$1" + $AWK "NR < $(($2+1)) { next } { print } NR == $(($2+$3)) { exit }" "$1" } ipset_restore_chunked() { @@ -160,7 +160,7 @@ elif exists ipset; then # only /tmp is considered tmpfs. other locations mean tmpdir was redirected to a disk SAVERAM=0 [ "$TMPDIR" = "/tmp" ] && { - RAMSIZE=$($GREP MemTotal /proc/meminfo | awk '{print $2}') + RAMSIZE=$($GREP MemTotal /proc/meminfo | $AWK '{print $2}') [ "$RAMSIZE" -lt "110000" ] && SAVERAM=1 } print_reloading_backend ipset diff --git a/ipset/def.sh b/ipset/def.sh index 6cdae24..892694f 100644 --- a/ipset/def.sh +++ b/ipset/def.sh @@ -60,6 +60,12 @@ else GREP=$(which grep) fi +# GNU awk is faster +if exists gawk; then + AWK=gawk +else + AWK=awk +fi grep_supports_b() { @@ -68,17 +74,16 @@ grep_supports_b() } get_ip_regex() { - REG_IPV4='((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(/([0-9]|[12][0-9]|3[012]))?' - REG_IPV6='[0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}|:)+(/([0-9][0-9]?|1[01][0-9]|12[0-8]))?' + REG_IPV4='((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\/([0-9]|[12][0-9]|3[012]))?' + REG_IPV6='[0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}|:)+(\/([0-9][0-9]?|1[01][0-9]|12[0-8]))?' # good but too slow # REG_IPV6='([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,7}:(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}(/[0-9]+)?|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}(/[0-9]+)?|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})(/[0-9]+)?|:((:[0-9a-fA-F]{1,4}){1,7}|:)(/([0-9][0-9]?|1[01][0-9]|12[0-8]))?' - grep_supports_b && { - REG_IPV4="\b$REG_IPV4\b" - REG_IPV6="\b$REG_IPV6\b" - } +# grep_supports_b && { +# REG_IPV4="\b$REG_IPV4\b" +# REG_IPV6="\b$REG_IPV6\b" +# } } - ip2net4() { if [ -x "$IP2NET" ]; then diff --git a/ipset/get_reestr_combined.sh b/ipset/get_reestr_combined.sh index 1a5569e..354e6a5 100755 --- a/ipset/get_reestr_combined.sh +++ b/ipset/get_reestr_combined.sh @@ -9,6 +9,11 @@ ZREESTR="$TMPDIR/reestr.txt" #ZURL_REESTR=https://reestr.rublacklist.net/api/current ZURL_REESTR=https://raw.githubusercontent.com/zapret-info/z-i/master/dump.csv +awkgrep() +{ + # $1 - pattern + nice -n 5 $AWK "{while ( match(\$0,/($1[ |;])/) ) { print substr(\$0,RSTART,RLENGTH-1); \$0=substr(\$0,RSTART+RLENGTH) } }" +} dig_reestr() { @@ -25,12 +30,12 @@ dig_reestr() # find entries with https or without domain name - they should be banned by IP # 2971-18 is TELEGRAM. lots of proxy IPs banned, list grows very large (nice -n 5 $GREP -avE "$DOMMASK" "$ZREESTR" ; $GREP -a "https://" "$ZREESTR") | - nice -n 5 $GREP -oE "$1" | cut_local | sort -u >$TMP + awkgrep "$1" | cut_local | sort -u >$TMP ip2net$4 <"$TMP" | zz "$3" # other IPs go to regular zapret list - tail -n +2 "$ZREESTR" | nice -n 5 $GREP -oE "$1" | cut_local | nice -n 5 $GREP -xvFf "$TMP" | ip2net$4 | zz "$2" + tail -n +2 "$ZREESTR" | awkgrep "$1" | cut_local | nice -n 5 $GREP -xvFf "$TMP" | ip2net$4 | zz "$2" rm -f "$TMP" } diff --git a/ipset/get_reestr_hostlist.sh b/ipset/get_reestr_hostlist.sh index 3900434..1c73751 100755 --- a/ipset/get_reestr_hostlist.sh +++ b/ipset/get_reestr_hostlist.sh @@ -25,7 +25,7 @@ if test $dlsize -lt 204800; then echo list file is too small. can be bad. exit 2 fi -(LANG=C cut -s -f2 -d';' "$ZREESTR" | LANG=C sed -Ee 's/^\*\.(.+)$/\1/' -ne 's/^[a-z0-9A-Z._-]+$/&/p' | awk '{ print tolower($0) }' ; cat "$ZUSERLIST" ) | sort -u | zz "$ZHOSTLIST" +(LANG=C cut -s -f2 -d';' "$ZREESTR" | LANG=C sed -Ee 's/^\*\.(.+)$/\1/' -ne 's/^[a-z0-9A-Z._-]+$/&/p' | $AWK '{ print tolower($0) }' ; cat "$ZUSERLIST" ) | sort -u | zz "$ZHOSTLIST" rm -f "$ZREESTR" hup_zapret_daemons diff --git a/ipset/get_reestr_ip.sh b/ipset/get_reestr_ip.sh index 7ab8f27..1e456ee 100755 --- a/ipset/get_reestr_ip.sh +++ b/ipset/get_reestr_ip.sh @@ -10,6 +10,12 @@ ZREESTR="$TMPDIR/reestr.txt" ZURL_REESTR=https://raw.githubusercontent.com/zapret-info/z-i/master/dump.csv +awkgrep() +{ + # $1 - pattern + nice -n 5 $AWK "{while ( match(\$0,/($1[ |;])/) ) { print substr(\$0,RSTART,RLENGTH-1); \$0=substr(\$0,RSTART+RLENGTH) } }" +} + dig_reestr() { # $1 - grep ipmask @@ -18,10 +24,9 @@ dig_reestr() echo processing reestr list $2 - tail -n +2 "$ZREESTR" | nice -n 5 $GREP -oE "$1" | cut_local | ip2net$3 | zz "$2" + tail -n +2 "$ZREESTR" | awkgrep "$1" | cut_local | ip2net$3 | zz "$2" } - getuser && { # assume all https banned by ip curl -k --fail --max-time 600 --connect-timeout 5 --retry 3 --max-filesize 251658240 "$ZURL_REESTR" -o "$ZREESTR" ||