#
# $Id: chinese-charset,v 1.1 2003/08/21 20:59:25 karsten Exp karsten $
#
# High-bit characterset filter (Chinese characters).
# KMSelf Sun Dec  9 14:11:10 PST 2001
#
# See if the spam matches a chinese ruleset.
# From:  http://www3.sympatico.ca/walter.dnes/email/chinese/
#
# This requires that no more than 5% of message body text be
# high-bit characters.
#
# To allow _more_ high-bit chars, *decrease* the weight for high-bit lines.
# To allow _fewer_ high-bit chars, *increase* the weight for high-bit lines.
# Weight is 1/(fraction high-bit), e.g.:  1/(0.05) = 20.
# NOTE(review): the high-bit weights used below are 10, which by this
# formula corresponds to a 10% threshold rather than 5% -- confirm which
# one is intended before changing either the comment or the weights.
#
# Arbitrarily require message to be at least 3200 bytes to trip filter
# (to exclude short messages w/funky sigs).  This is about 40 lines of
# 80-column text.
#
# How every recipe below is structured:
#   1. A weighted-scoring condition list decides whether the message
#      looks like high-bit / foreign-charset spam.
#   2. Inside the block, a copy of the message is piped to chkmail.  The
#      exit status is negated with "!" so that the following ":0a" recipe
#      only runs when chkmail reports failure.
#      (Presumably chkmail exits 0 when From/Sender is found in
#      $WHITELIST -- confirm against the chkmail script.)
#      The "W" flag is required here: without "w"/"W" procmail ignores the
#      exit code of the program and ":0a" would always fire, making the
#      whitelist check a no-op.
#   3. The message is logged, fed to sa-learn as spam, and filed in
#      Spam-ricochet/.
#
# NOTE(review): the bracketed character classes appear to enumerate the
# single-byte range 0xA0-0xFF (Latin-1).  Presumably this file must be
# stored in a single-byte encoding for them to match individual high-bit
# bytes -- confirm after any re-encoding or copy/paste of this file.

# --- Recipe 1: score the body (B), case-sensitively (D) ------------------
# Every character costs 1 point; quoted-printable escapes and high-bit
# characters earn points.  The recipe matches when the total is positive.
:0BD
* > 3200
* -1^1 .
* 2^1 =[0-9A-F][0-9A-F]
* 10^1 [ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿]
* 10^1 [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
* 10^1 [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]
* 10^1 =[A-F][0-9A-F]
{
    # Whitelist probe: W = wait for chkmail and honour its exit code
    # quietly, c = work on a copy so processing continues.
    :0Wc
    | ! chkmail --header "From|Sender" $WHITELIST

    # Only reached when the probe above succeeded (sender not whitelisted).
    :0a
    {
        LOG="(spam!: High-bit (e.g.: Asian) characterset)"

        # :0c
        # ! uce@ftc.gov

        # Train spamassassin
        :0c
        | sa-learn --spam --single

        :0:
        Spam-ricochet/
    }
}

# ...or catch by subject (thanks to Alson van der Meulen):
#
# Note: the bracket expressions in the Subject: condition are meant to
# match [<space><tab>]; if you're pasting, check it.
# NOTE(review): only a space is present in the brackets as found here; the
# tab may have been lost in transit -- verify against a known-good copy.
#
# The text after "Subject:" is captured into MATCH (via \/) and then
# scored the same way as the body above, skipping replies ("Re:").
:0D
* ^[Ss][Uu][Bb][Jj][Ee][Cc][Tt]:[ ]*\/[^ ]*$
* -1^1 MATCH ?? .
* ! MATCH ?? ^Re:
* 2^1 MATCH ?? =[0-9A-F][0-9A-F]
* 6^1 MATCH ?? [ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿]
* 6^1 MATCH ?? [ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß]
* 6^1 MATCH ?? [àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]
{
    # Whitelist probe; see the structure notes at the top of the file.
    :0Wc
    | ! chkmail --header "From|Sender" $WHITELIST

    :0a
    {
        LOG="(spam!: High-bit (e.g.: Asian) characterset in subject)"

        # :0c
        # ! uce@ftc.gov

        # Train spamassassin
        :0c
        | sa-learn --spam --single

        :0:
        Spam-ricochet/
    }
}

# ...or catch it by characterset:
#
# Scoring starts at -2 and each condition that matches adds 1 (exponent 0
# means a condition counts at most once), so three conditions must match:
# in practice the MIME preamble, a text/plain or text/html part, and one
# of the listed charsets.
#
# Charsets, in order:
#   Korean, Chinese, Korean, Chinese, Cyrillic, Turkish, Japanese
# (A second, duplicate gb2312 condition used to follow the list; it gave
# gb2312 double weight compared with every other charset and was removed.)
#
# NOTE(review): the D flag makes these matches case-sensitive, so e.g.
# charset="GB2312" or "Content-type:" will not match -- confirm that this
# is intended.  The leading [ ] class likewise contains only a space here,
# so tab-indented continuation lines will not match.
:0BD
* -2^0
* 1^0 ^This is a multi-part message in MIME format.
* 1^0 ^Content-Type: text/(plain|html)
* 1^0 ^[ ]+charset="ks_c_5601-1987"
* 1^0 ^[ ]+charset="big5"
* 1^0 ^[ ]+charset="euc-kr"
* 1^0 ^[ ]+charset="gb2312"
* 1^0 ^[ ]+charset="koi8-r"
* 1^0 ^[ ]+charset="iso-8859-9"
* 1^0 ^[ ]+charset="iso-2022-jp"
{
    # Whitelist probe; see the structure notes at the top of the file.
    :0Wc
    | ! chkmail --header "From|Sender" $WHITELIST

    :0a
    {
        LOG="(spam!: Asian charset encoding)"

        #:0c
        # ! uce@ftc.gov

        # Train spamassassin
        :0c
        | sa-learn --spam --single

        :0:
        Spam-ricochet/
    }
}