From owner-svn-ports-head@freebsd.org  Sat Oct 17 21:37:16 2015
Return-Path: <owner-svn-ports-head@freebsd.org>
Delivered-To: svn-ports-head@mailman.ysv.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org
 [IPv6:2001:1900:2254:206a::19:1])
 by mailman.ysv.freebsd.org (Postfix) with ESMTP id 5C382A17A36;
 Sat, 17 Oct 2015 21:37:16 +0000 (UTC) (envelope-from pi@FreeBSD.org)
Received: from repo.freebsd.org (repo.freebsd.org
 [IPv6:2610:1c1:1:6068::e6a:0])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client did not present a certificate)
 by mx1.freebsd.org (Postfix) with ESMTPS id 23656156C;
 Sat, 17 Oct 2015 21:37:16 +0000 (UTC) (envelope-from pi@FreeBSD.org)
Received: from repo.freebsd.org ([127.0.1.37])
 by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id t9HLbFGs094930;
 Sat, 17 Oct 2015 21:37:15 GMT (envelope-from pi@FreeBSD.org)
Received: (from pi@localhost)
 by repo.freebsd.org (8.15.2/8.15.2/Submit) id t9HLbFV3094928;
 Sat, 17 Oct 2015 21:37:15 GMT (envelope-from pi@FreeBSD.org)
Message-Id: <201510172137.t9HLbFV3094928@repo.freebsd.org>
X-Authentication-Warning: repo.freebsd.org: pi set sender to pi@FreeBSD.org
 using -f
From: Kurt Jaeger <pi@FreeBSD.org>
Date: Sat, 17 Oct 2015 21:37:15 +0000 (UTC)
To: ports-committers@freebsd.org, svn-ports-all@freebsd.org,
 svn-ports-head@freebsd.org
Subject: svn commit: r399603 - in head/japanese/spamassassin: . files
X-SVN-Group: ports-head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-ports-head@freebsd.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: SVN commit messages for the ports tree for head
 <svn-ports-head.freebsd.org>
List-Unsubscribe: <https://lists.freebsd.org/mailman/options/svn-ports-head>, 
 <mailto:svn-ports-head-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-ports-head/>
List-Post: <mailto:svn-ports-head@freebsd.org>
List-Help: <mailto:svn-ports-head-request@freebsd.org?subject=help>
List-Subscribe: <https://lists.freebsd.org/mailman/listinfo/svn-ports-head>,
 <mailto:svn-ports-head-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Sat, 17 Oct 2015 21:37:16 -0000

Author: pi
Date: Sat Oct 17 21:37:14 2015
New Revision: 399603
URL: https://svnweb.freebsd.org/changeset/ports/399603

Log:
  japanese/spamassassin: Unbreak and adapt to 3.4.1
  
  PR:		203036
  Submitted by:	fmysh@iijmio-mail.jp

Modified:
  head/japanese/spamassassin/Makefile
  head/japanese/spamassassin/files/spamassassin-ja.patch

Modified: head/japanese/spamassassin/Makefile
==============================================================================
--- head/japanese/spamassassin/Makefile	Sat Oct 17 20:35:26 2015	(r399602)
+++ head/japanese/spamassassin/Makefile	Sat Oct 17 21:37:14 2015	(r399603)
@@ -1,7 +1,7 @@
 # Created by: TAOKA Fumiyoshi
 # $FreeBSD$
 
-PORTREVISION=	1
+PORTREVISION=	2
 CATEGORIES=	japanese mail perl5
 PKGNAMEPREFIX=	ja-
 
@@ -25,8 +25,6 @@ TOKENIZER_PRE=	tokenizer.pre
 
 PLIST_SUB+=	TOKENIZER_PRE=${TOKENIZER_PRE}
 
-BROKEN=		Requires update for 3.4.1
-
 pre-install:
 	@${CAT} ${EXTRA_PATCHES:S/.patch/.plist/} > ${PLIST}
 	@${CAT} ${PKGDIR}/pkg-plist >> ${PLIST}

Modified: head/japanese/spamassassin/files/spamassassin-ja.patch
==============================================================================
--- head/japanese/spamassassin/files/spamassassin-ja.patch	Sat Oct 17 20:35:26 2015	(r399602)
+++ head/japanese/spamassassin/files/spamassassin-ja.patch	Sat Oct 17 21:37:14 2015	(r399603)
@@ -1,105 +1,79 @@
---- lib/Mail/SpamAssassin/HTML.pm.orig	2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/HTML.pm	2014-03-04 11:18:44.000000000 +0900
-@@ -86,7 +86,7 @@
- $ok_attributes{div}{$_} = 1 for qw( style );
- 
- sub new {
--  my ($class) = @_;
-+  my ($class, $opts) = @_;
-   my $self = $class->SUPER::new(
- 		api_version => 3,
- 		handlers => [
-@@ -99,6 +99,7 @@
- 			declaration => ["html_declaration", "self,text"],
- 		],
- 		marked_sections => 1);
-+  $self->{normalize} = $opts->{'normalize'} || 0;
- 
-   $self;
- }
-@@ -681,7 +682,14 @@
-     }
+--- lib/Mail/SpamAssassin/HTML.pm	2015-04-29 04:56:49.000000000 +0900
++++ lib/Mail/SpamAssassin/HTML.pm	2015-08-30 00:46:40.902000000 +0900
+@@ -695,7 +695,8 @@
    }
    else {
--    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
-+    if ($self->{normalize}) {
-+      $text =~ s/\xc2\xa0/ /g;           # no-break space
-+      $text =~ s/\xe3\x80\x80/ /g;       # ideographicspace
-+      $text =~ s/[ \t\n\r\f\x0b]+/ /g;
-+    }
-+    else {
-+      $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
-+    }
+     # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
+-    $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
++    # Ideographic Space: UTF-8: E3 80 80
++    $text =~ s/[ \t\n\r\f\x0b]+|(?:\xc2\xa0)+|(?:\xe3\x80\x80)+/ /gs;
      # trim leading whitespace if previous element was whitespace 
      # and current element is not invisible
      if (@{ $self->{text} } && !$display{invisible} &&
---- lib/Mail/SpamAssassin/Message/Node.pm.orig	2014-02-07 17:36:23.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message/Node.pm	2014-03-04 11:22:38.000000000 +0900
-@@ -42,6 +42,7 @@
+@@ -742,7 +743,8 @@
+   my $invisible_for_bayes = 0;
+ 
+   # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
+-  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) {
++  # Ideographic Space: UTF-8: E3 80 80
++  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0|\xe3\x80\x80)*\z/s) {
+     $invisible_for_bayes = $self->html_font_invisible($text);
+   }
+ 
+--- lib/Mail/SpamAssassin/Message/Node.pm	2015-04-29 04:56:48.000000000 +0900
++++ lib/Mail/SpamAssassin/Message/Node.pm	2015-08-30 00:25:32.534000000 +0900
+@@ -44,6 +44,7 @@
  use Mail::SpamAssassin::Constants qw(:sa);
  use Mail::SpamAssassin::HTML;
  use Mail::SpamAssassin::Logger;
 +use Mail::SpamAssassin::Util::Charset;
  
- =item new()
+ our($enc_utf8, $enc_w1252, $have_encode_detector);
+ BEGIN {
+@@ -407,6 +408,10 @@
  
-@@ -385,27 +386,10 @@
+   return $_[1]  unless $self->{normalize} && $enc_utf8;
  
- sub _normalize {
-   my ($self, $data, $charset) = @_;
--  return $data unless $self->{normalize};
-+  return wantarray ? ($data, $charset) : $data unless $self->{normalize};
- 
--  my $detected = Encode::Detect::Detector::detect($data);
--
--  my $converter;
--
--  if ($charset && $charset !~ /^us-ascii$/i &&
--      ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) {
--      dbg("message: Using labeled charset $charset");
--      $converter = Encode::find_encoding($charset);
--  }
--
--  $converter = Encode::find_encoding($detected) unless $converter || !defined($detected);
--
--  return $data unless $converter;
--
--  dbg("message: Converting...");
--
--  my $rv = $converter->decode($data, 0);
--  utf8::downgrade($rv, 1);
--  return $rv
-+  my ($decoded_data, $detected_charset) = normalize_charset($data, $charset);
-+  return wantarray ? ($decoded_data, $detected_charset) : $decoded_data;
- }
++  # FIXME: to be merged.
++  my ($decoded_data, $charset_detected) = normalize_charset($_[1], $charset_declared, $return_decoded);
++  return wantarray ? ($decoded_data, $charset_detected) : $decoded_data;
++
+   warn "message: _normalize() was given characters, expected bytes: $_[1]\n"
+     if utf8::is_utf8($_[1]);
  
- =item rendered()
-@@ -428,8 +412,12 @@
-     # text/x-aol is ignored here, but looks like text/html ...
-     return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
- 
--    my $text = $self->_normalize($self->decode(), $self->{charset});
-+    my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset});
-     my $raw = length($text);
-+    if ($self->{normalize}) {
-+      $self->{charset} = $charset;
-+      $self->{language} = get_language($text, $charset);
-+    }
+@@ -603,6 +608,7 @@
+ 
+     my $text = $self->decode;  # QP and Base64 decoding, bytes
+     my $text_len = length($text);  # num of bytes in original charset encoding
++    my $charset;
  
      # render text/html always, or any other text|text/plain part as text/html
      # based on a heuristic which simulates a certain common mail client
-@@ -439,7 +427,7 @@
-     {
-       $self->{rendered_type} = 'text/html';
- 
--      my $html = Mail::SpamAssassin::HTML->new();	# object
-+      my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}});       # object
-       $html->parse($text);				# parse+render text
-       $self->{rendered} = $html->get_rendered_text();
-       $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
---- lib/Mail/SpamAssassin/Message.pm.orig	2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message.pm	2014-03-04 11:27:31.000000000 +0900
-@@ -604,6 +604,8 @@
+@@ -622,7 +628,9 @@
+         # subroutine _normalize() to return Unicode text.  See Bug 7133
+         #
+         $character_semantics = 1;  # $text will be in characters
+-        $text = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
++        ($text, $charset) = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
++        $self->{charset} = $charset;
++        $self->{language} = get_language($text, $charset);
+       } elsif (!defined $self->{charset} ||
+                $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
+         # With some luck input can be interpreted as UTF-8, do not warn.
+@@ -657,7 +665,9 @@
+     else {  # plain text
+       if ($self->{normalize} && $enc_utf8) {
+         # request transcoded result as UTF-8 octets!
+-        $text = $self->_normalize($text, $self->{charset}, 0);
++        ($text, $charset) = $self->_normalize($text, $self->{charset}, 0);
++        $self->{charset} = $charset;
++        $self->{language} = get_language($text, $charset);
+       }
+       $self->{rendered_type} = $self->{type};
+       $self->{rendered} = $self->{'visible_rendered'} = $text;
+--- lib/Mail/SpamAssassin/Message.pm	2015-04-29 04:56:49.000000000 +0900
++++ lib/Mail/SpamAssassin/Message.pm	2015-08-30 00:52:32.210000000 +0900
+@@ -627,6 +627,8 @@
    delete $self->{'pristine_headers'};
    delete $self->{'line_ending'};
    delete $self->{'missing_head_body_separator'};
@@ -108,7 +82,7 @@
  
    my @toclean = ( $self );
  
-@@ -630,6 +632,8 @@
+@@ -653,6 +655,8 @@
      delete $part->{'invisible_rendered'};
      delete $part->{'type'};
      delete $part->{'rendered_type'};
@@ -117,58 +91,21 @@
  
      # if there are children nodes, add them to the queue of nodes to clean up
      if (exists $part->{'body_parts'}) {
-@@ -1085,7 +1089,14 @@
- 
+@@ -1143,6 +1147,9 @@
    # whitespace handling (warning: small changes have large effects!)
    $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
--  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
+ # $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
 +  if ($self->{normalize}) {
-+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
-+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
-+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
-+  }
-+  else {
-+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
++    $text =~ s/\xc2\xa0|\xe3\x80\x80/ /g; # whitespace (NBSP, ideographic space) => space
 +  }
+   $text =~ tr/ \t\n\r\x0b/ /s;		# whitespace (incl. VT) => space
    $text =~ tr/\f/\n/;			# form feeds => newline
-   
-   # warn "message: $text";
-@@ -1142,7 +1153,14 @@
- 
-   # whitespace handling (warning: small changes have large effects!)
-   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
--  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
-+  if ($self->{normalize}) {
-+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
-+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
-+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
-+  }
-+  else {
-+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
-+  }
-   $text =~ tr/\f/\n/;			# form feeds => newline
- 
-   my @textary = split_into_array_of_short_lines ($text);
-@@ -1193,7 +1211,14 @@
  
-   # whitespace handling (warning: small changes have large effects!)
-   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
--  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
-+  if ($self->{normalize}) {
-+    $text =~ s/\xc2\xa0/ /g;		# no-break space => space
-+    $text =~ s/\xe3\x80\x80/ /g;	# ideographicspace => space
-+    $text =~ tr/ \t\n\r\x0b/ /s;	# whitespace => space
-+  }
-+  else {
-+    $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace => space
-+  }
-   $text =~ tr/\f/\n/;			# form feeds => newline
- 
-   my @textary = split_into_array_of_short_lines ($text);
-@@ -1269,6 +1294,28 @@
+@@ -1235,6 +1242,27 @@
+ }
  
  # ---------------------------------------------------------------------------
- 
++
 +sub get_language {
 +  my ($self) = @_;
 +
@@ -189,39 +126,28 @@
 +}
 +
 +# ---------------------------------------------------------------------------
-+
-+
+ 
  1;
  
- =back
---- lib/Mail/SpamAssassin/PerMsgStatus.pm.orig	2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	2014-03-04 11:30:25.000000000 +0900
-@@ -53,6 +53,7 @@
- use warnings;
- use re 'taint';
+--- lib/Mail/SpamAssassin/PerMsgStatus.pm	2015-04-29 04:56:49.000000000 +0900
++++ lib/Mail/SpamAssassin/PerMsgStatus.pm	2015-08-30 00:55:35.583000000 +0900
+@@ -55,6 +55,7 @@
  
-+use Encode;
  use Errno qw(ENOENT);
  use Time::HiRes qw(time);
++use Encode;
  
-@@ -996,19 +997,41 @@
- 
-   # the report charset
-   my $report_charset = "; charset=iso-8859-1";
--  if ($self->{conf}->{report_charset}) {
--    $report_charset = "; charset=" . $self->{conf}->{report_charset};
--  }
- 
+ use Mail::SpamAssassin::Constants qw(:sa);
+ use Mail::SpamAssassin::AsyncLoop;
+@@ -1053,12 +1054,32 @@
    # the SpamAssassin report
    my $report = $self->get_report();
-+  if ($self->{conf}->{report_charset}) {
-+    $report_charset = "; charset=" . $self->{conf}->{report_charset};
-+  }
  
-   # If there are any wide characters, need to MIME-encode in UTF-8
-   # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
-   # we could try converting to that charset if possible
+-  # If there are any wide characters, need to MIME-encode in UTF-8
+-  # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
+-  # we could try converting to that charset if possible
 -  unless ($] < 5.008 || utf8::downgrade($report, 1)) {
++  # decode to utf-8.
 +  my $is_utf8 = 0;
 +  if ($self->{conf}->{normalize_charset}) {
 +    $report = Encode::decode_utf8($report);
@@ -236,8 +162,8 @@
 +      };
 +    }
 +  }
++  # encode to report_charset. encode to utf-8 if charset conversion fail.
 +  if ($is_utf8) {
-+    $is_utf8 = 1;
 +    eval {
 +      my $scratch = $report;
 +      $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
@@ -251,14 +177,22 @@
    }
  
    # get original headers, "pristine" if we can do it
---- lib/Mail/SpamAssassin/Plugin/Bayes.pm.orig	2014-02-07 17:36:27.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	2014-03-04 11:34:46.000000000 +0900
-@@ -223,6 +223,15 @@
+--- lib/Mail/SpamAssassin/Plugin/Bayes.pm	2015-04-29 04:56:47.000000000 +0900
++++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	2015-08-30 00:25:43.443000000 +0900
+@@ -70,6 +70,7 @@
+   $MARK_PRESENCE_ONLY_HDRS
+   %HEADER_NAME_COMPRESSION
+   $OPPORTUNISTIC_LOCK_VALID
++  $SKIP_UTF8_SHORT_TOKENS_RE
+ };
+ 
+ # Which headers should we scan for tokens?  Don't use all of them, as it's easy
+@@ -226,6 +227,15 @@
  # will require a longer token than English ones.)
  use constant MAX_TOKEN_LENGTH => 15;
  
 +# Skip if a token is too short.
-+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
++$SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
 +    [\x00-\x7F]                # 1 byte
 +  | [\xC0-\xDF][\x80-\xBF]     # 2 bytes
 +  | [\xE0-\xEF][\x80-\xBF]{2}  # 3 bytes
@@ -269,12 +203,12 @@
  ###########################################################################
  
  sub new {
-@@ -1039,9 +1048,28 @@
-   $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
-   $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
-   @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
+@@ -1048,9 +1058,28 @@
+     $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
+   @{$msgdata->{bayes_token_uris}} =
+     $pms->get_uri_list() if $t_src->{uri};
 +  if ($self->{conf}->{normalize_charset}) {
-+    my $tokenizer = $self->get_tokenizer($msg);
++    my $tokenizer = $self->get_tokenizer($pms);
 +    if (ref($tokenizer)) {
 +      $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
 +      $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
@@ -298,41 +232,30 @@
  ###########################################################################
  
  # The calling functions expect a uniq'ed array of tokens ...
-@@ -1095,7 +1123,7 @@
-   # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
-   # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
-   # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
--  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
-+  tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs;
- 
-   # DO split on "..." or "--" or "---"; common formatting error resulting in
-   # hapaxes.  Keep the separator itself as a token, though, as long ones can
-@@ -1124,6 +1152,11 @@
-     #
-     next if ( defined $magic_re && $token =~ /$magic_re/ );
+@@ -1192,6 +1221,11 @@
+     next if $len < 3 ||
+ 	($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
  
 +    # Skip short UTF-8 tokens.
 +    if ($self->{conf}->{normalize_charset}) {
 +      next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
 +    }
 +
-     # *do* keep 3-byte tokens; there's some solid signs in there
-     my $len = length($token);
+     # are we in the body?  If so, apply some body-specific breakouts
+     if ($region == 1 || $region == 2) {
+       if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
+@@ -1222,14 +1256,16 @@
+ 	}
+       }
  
-@@ -1152,14 +1185,16 @@
-     # the domain ".net" appeared in the To header.
-     #
-     if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
 -      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
 -	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
 -	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
 -	# to me! (jm)
 -	while ($token =~ s/^(..?)//) {
 -	  push (@rettokens, "8:$1");
--	}
--	next;
 +      unless ($self->{conf}->{normalize_charset}) {
-+        if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
++	if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
 +	  # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
 +	  # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
 +	  # to me! (jm)
@@ -340,13 +263,13 @@
 +	    push (@rettokens, "8:$1");
 +	  }
 +	  next;
-+        }
+ 	}
+-	next;
        }
  
        if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
---- /dev/null	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	2011-07-14 22:29:19.000000000 +0900
+--- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	1970-01-01 09:00:00.000000000 +0900
++++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	2015-08-30 00:25:32.537000000 +0900
 @@ -0,0 +1,84 @@
 +# <@LICENSE>
 +# Copyright 2004 Apache Software Foundation
@@ -432,9 +355,8 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +
 +1;
 +
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
---- /dev/null	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	2011-07-14 22:29:19.000000000 +0900
+--- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	1970-01-01 09:00:00.000000000 +0900
++++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	2015-08-30 00:25:32.538000000 +0900
 @@ -0,0 +1,111 @@
 +# <@LICENSE>
 +# Copyright 2004 Apache Software Foundation
@@ -547,9 +469,8 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +
 +1;
 +
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer.pm
---- /dev/null	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	2011-07-14 22:35:46.000000000 +0900
+--- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	1970-01-01 09:00:00.000000000 +0900
++++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	2015-08-30 00:25:32.538000000 +0900
 @@ -0,0 +1,115 @@
 +# <@LICENSE>
 +# Copyright 2004 Apache Software Foundation
@@ -666,10 +587,9 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +
 +1;
 +
-diff -uNr /dev/null lib/Mail/SpamAssassin/Util/Charset.pm
---- /dev/null	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Util/Charset.pm	2011-07-14 22:29:19.000000000 +0900
-@@ -0,0 +1,471 @@
+--- lib/Mail/SpamAssassin/Util/Charset.pm	1970-01-01 09:00:00.000000000 +0900
++++ lib/Mail/SpamAssassin/Util/Charset.pm	2015-08-30 00:25:32.539000000 +0900
+@@ -0,0 +1,473 @@
 +# <@LICENSE>
 +# Copyright 2006 Apache Software Foundation
 +#
@@ -959,6 +879,7 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +  Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
 +  if (HAS_ENCODE_EUCJPMS) {
 +    Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
++    Encode::Alias::define_alias( qr/^euc-jp$/i => ' "cp51932"' );
 +  }
 +}
 +
@@ -998,6 +919,7 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +sub normalize_charset {
 +  my $str = shift;
 +  my $charset = shift;
++  my $return_decoded = shift;
 +
 +  return wantarray ? ($str, 'ascii')  : $str unless ($str);
 +
@@ -1017,10 +939,10 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +    return ($str, undef);
 +  }
 +  $decoded =~ s/^\x{feff}//g;
-+  $decoded = Encode::encode_utf8($decoded);
++  $decoded = Encode::encode_utf8($decoded) if $return_decoded;
 +
 +  # unfold hiragana, katakana and han
-+  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) {
++  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$|CP50220|CP50221$)/i) {
 +    $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
 +  }
 +  return wantarray ? ($decoded, $detected) : $decoded;
@@ -1042,7 +964,7 @@ diff -uNr /dev/null lib/Mail/SpamAssassi
 +  return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
 +  return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
 +
-+  #$encoding = _get_alias($encoding);
++  $encoding = _get_alias($encoding);
 +  my $encoder = Encode::find_encoding($encoding);
 +  if (ref($encoder)) {
 +    $decoded = $encoder->decode($str,Encode::FB_QUIET);