diff --git a/Makefile.PL b/Makefile.PL index 59c5d86..a346b17 100755 --- a/Makefile.PL +++ b/Makefile.PL @@ -19,14 +19,17 @@ extra_build: --plugin=orphans --plugin=haiku --plugin=meta ./mdwn2man ikiwiki 1 doc/usage.mdwn > ikiwiki.man ./mdwn2man ikiwiki-mass-rebuild 8 doc/ikiwiki-mass-rebuild.mdwn > ikiwiki-mass-rebuild.man + cd l10n && $(MAKE) install extra_clean: rm -rf html doc/.ikiwiki rm -f ikiwiki.man ikiwiki-mass-rebuild.man + cd l10n && $(MAKE) distclean extra_install: - install -d $(PREFIX)/share/ikiwiki/templates - cp templates/* $(PREFIX)/share/ikiwiki/templates + install -d $(PREFIX)/share/ikiwiki/templates/en + cp templates/* $(PREFIX)/share/ikiwiki/templates/en + cp -a translations/* $(PREFIX)/share/ikiwiki/templates/ || true install -d $(PREFIX)/share/ikiwiki/basewiki cp -a basewiki/* $(PREFIX)/share/ikiwiki/basewiki diff --git a/l10n/Makefile b/l10n/Makefile new file mode 100644 index 0000000..ee1311b --- /dev/null +++ b/l10n/Makefile @@ -0,0 +1,38 @@ +TMPL_PROCESS = ./tmpl_process3.pl + +SOURCE_DIR = ../templates +TARGET_DIR = ../translations +PO_DIR = ../po +POT_FILE = $(PO_DIR)/templates.pot + +LANGUAGES = $(patsubst $(PO_DIR)/%.po, $(TARGET_DIR)/%, $(wildcard $(PO_DIR)/*.po)) + +all: install + +pot: $(POT_FILE) + +update: pot $(wildcard $(PO_DIR)/*.po) + +install: $(LANGUAGES) + +clean: + rm -rf $(PO_DIR)/*~ $(LANGUAGES) + +distclean: clean + rmdir --ignore-fail-on-non-empty $(TARGET_DIR) + +%.pot: $(wildcard $(SOURCE_DIR)/*.tmpl) + mkdir -p $(dir $@) + rm -f $@ + $(TMPL_PROCESS) create -s $@ -r -i $(SOURCE_DIR) + +%.po: $(wildcard $(SOURCE_DIR)/*.tmpl) + $(TMPL_PROCESS) update -s $@ -r -i $(SOURCE_DIR) + touch $@ # TMPL_PROCESS doesn't touch file if not needed + +$(TARGET_DIR)/%: $(PO_DIR)/%.po + mkdir -p $@ + $(TMPL_PROCESS) install -s $< -o $@ -r -i $(SOURCE_DIR) + touch $@ # TMPL_PROCESS doesn't touch dir if not needed + +.PHONY: update pot diff --git a/l10n/TmplToken.pm b/l10n/TmplToken.pm new file mode 100644 index 0000000..0a54d0b --- /dev/null +++ b/l10n/TmplToken.pm @@ -0,0 +1,157 @@ +package TmplToken; + +use strict; +use TmplTokenType; +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +############################################################################### + +=head1 NAME + +TmplToken.pm - Object representing a scanner token for .tmpl files + +=head1 DESCRIPTION + +This is a class representing a token scanned from an HTML::Template .tmpl file. + +=cut + +############################################################################### + +$VERSION = 0.01; + +@ISA = qw(Exporter); +@EXPORT_OK = qw(); + +############################################################################### + +sub new { + my $this = shift; + my $class = ref($this) || $this; + my $self = {}; + bless $self, $class; + ($self->{'_string'}, $self->{'_type'}, $self->{'_lc'}, $self->{'_path'}) = @_; + return $self; +} + +sub string { + my $this = shift; + return $this->{'_string'} +} + +sub type { + my $this = shift; + return $this->{'_type'} +} + +sub pathname { + my $this = shift; + return $this->{'_path'} +} + +sub line_number { + my $this = shift; + return $this->{'_lc'} +} + +sub attributes { + my $this = shift; + return $this->{'_attr'}; +} + +sub set_attributes { + my $this = shift; + $this->{'_attr'} = ref $_[0] eq 'HASH'? $_[0]: \@_; + return $this; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub children { + my $this = shift; + return $this->{'_kids'}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub set_children { + my $this = shift; + $this->{'_kids'} = ref $_[0] eq 'ARRAY'? $_[0]: \@_; + return $this; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +# FIXME: DIRECTIVE is not necessarily TMPL_VAR !! +sub parameters_and_fields { + my $this = shift; + return map { $_->type == TmplTokenType::DIRECTIVE? $_: + ($_->type == TmplTokenType::TAG + && $_->string =~ /^{'_kids'}}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub anchors { + my $this = shift; + return map { $_->type == TmplTokenType::TAG && $_->string =~ /^{'_kids'}}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub form { + my $this = shift; + return $this->{'_form'}; +} + +# only meaningful for TEXT_PARAMETRIZED tokens +sub set_form { + my $this = shift; + $this->{'_form'} = $_[0]; + return $this; +} + +sub has_js_data { + my $this = shift; + return defined $this->{'_js_data'} && ref($this->{'_js_data'}) eq 'ARRAY'; +} + +sub js_data { + my $this = shift; + return $this->{'_js_data'}; +} + +sub set_js_data { + my $this = shift; + $this->{'_js_data'} = $_[0]; + return $this; +} + +# predefined tests + +sub tag_p { + my $this = shift; + return $this->type == TmplTokenType::TAG; +} + +sub cdata_p { + my $this = shift; + return $this->type == TmplTokenType::CDATA; +} + +sub text_p { + my $this = shift; + return $this->type == TmplTokenType::TEXT; +} + +sub text_parametrized_p { + my $this = shift; + return $this->type == TmplTokenType::TEXT_PARAMETRIZED; +} + +sub directive_p { + my $this = shift; + return $this->type == TmplTokenType::DIRECTIVE; +} + +############################################################################### + +1; diff --git a/l10n/TmplTokenType.pm b/l10n/TmplTokenType.pm new file mode 100644 index 0000000..75d4f52 --- /dev/null +++ b/l10n/TmplTokenType.pm @@ -0,0 +1,128 @@ +package TmplTokenType; + +use strict; +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +############################################################################### + +=head1 NAME + +TmplTokenType.pm - Types of TmplToken objects + +=head1 DESCRIPTION + +This is a Java-style "safe enum" singleton class for types of TmplToken objects. +The predefined constants are + +=cut + +############################################################################### + +$VERSION = 0.01; + +@ISA = qw(Exporter); +@EXPORT_OK = qw( + &TEXT + &TEXT_PARAMETRIZED + &CDATA + &TAG + &DECL + &PI + &DIRECTIVE + &COMMENT + &UNKNOWN +); + +############################################################################### + +use vars qw( $_text $_text_parametrized $_cdata + $_tag $_decl $_pi $_directive $_comment $_null $_unknown ); + +BEGIN { + my $new = sub { + my $this = 'TmplTokenType';#shift; + my $class = ref($this) || $this; + my $self = {}; + bless $self, $class; + ($self->{'id'}, $self->{'name'}, $self->{'desc'}) = @_; + return $self; + }; + $_text = &$new(0, 'TEXT'); + $_text_parametrized = &$new(8, 'TEXT-PARAMETRIZED'); + $_cdata = &$new(1, 'CDATA'); + $_tag = &$new(2, 'TAG'); + $_decl = &$new(3, 'DECL'); + $_pi = &$new(4, 'PI'); + $_directive = &$new(5, 'DIRECTIVE'); + $_comment = &$new(6, 'COMMENT'); + $_unknown = &$new(7, 'UNKNOWN'); +} + +sub to_string { + my $this = shift; + return $this->{'name'} +} + +sub TEXT () { $_text } +sub TEXT_PARAMETRIZED () { $_text_parametrized } +sub CDATA () { $_cdata } +sub TAG () { $_tag } +sub DECL () { $_decl } +sub PI () { $_pi } +sub DIRECTIVE () { $_directive } +sub COMMENT () { $_comment } +sub UNKNOWN () { $_unknown } + +############################################################################### + +=over + +=item TEXT + +normal text (#text in the DTD) + +=item TEXT_PARAMETRIZED + +parametrized normal text +(result of simple recognition of text interspersed with directives; +this has to be explicitly enabled in the scanner) + +=item CDATA + +normal text (CDATA in the DTD) + +=item TAG + +something that has the form of an HTML tag + +=item DECL + +something that has the form of an SGML declaration + +=item PI + +something that has the form of an SGML processing instruction + +=item DIRECTIVE + +a HTML::Template directive (whether or not embedded in an SGML comment) + +=item COMMENT + +something that has the form of an HTML comment +(and is not recognized as an HTML::Template directive) + +=item UNKNOWN + +something that is not recognized at all by the scanner + +=back + +Note that end of file is currently represented by undef, +instead of a constant predefined by this module. + +=cut + +1; diff --git a/l10n/TmplTokenizer.pm b/l10n/TmplTokenizer.pm new file mode 100644 index 0000000..30d011d --- /dev/null +++ b/l10n/TmplTokenizer.pm @@ -0,0 +1,1150 @@ +package TmplTokenizer; + +use strict; +use TmplTokenType; +use TmplToken; +use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic ); +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +############################################################################### + +=head1 NAME + +TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files + +=head1 DESCRIPTION + +Because .tmpl files contains HTML::Template directives +that tend to confuse real parsers (e.g., HTML::Parse), +it might be better to create a customized scanner +to scan the template files for tokens. +This module is a simple-minded attempt at such a scanner. + +=cut + +############################################################################### + +$VERSION = 0.02; + +@ISA = qw(Exporter); +@EXPORT_OK = qw(); + +use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p ); +use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p ); +use vars qw( $pedantic_error_markup_in_pcdata_p ); + +############################################################################### + +# Hideous stuff +use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include ); +use vars qw( $re_directive_control $re_tmpl_endif_endloop ); +BEGIN { + # $re_directive must not do any backreferences + $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; + # TMPL_VAR or TMPL_INCLUDE + $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; + $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; + # TMPL_VAR ESCAPE=1/HTML/URL + $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; + # Any control flow directive + $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; + # /LOOP or /IF or /UNLESS + $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; +} + +# Hideous stuff from subst.pl, slightly modified to use the above hideous stuff +# Note: The $re_tag's set $1 (), and $3 (rest of string) +use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag ); +use vars qw( $re_tag_strict $re_tag_compat @re_tag ); +sub re_tag ($) { + my($compat) = @_; + my $etag = $compat? '>': '<>\/'; + # This is no longer similar to the original regexp in subst.pl :-( + # Note that we don't want <> in compat mode; Mozilla knows about < + q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:} + . $re_directive + . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)}; +} +BEGIN { + $re_comment = '(?:--(?:[^-]|-[^-])*--)'; + $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML + $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace + $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag + @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1)); +} + +# End of the hideous stuff + +use vars qw( $serial ); + +############################################################################### + +sub FATAL_P () {'fatal-p'} +sub SYNTAXERROR_P () {'syntaxerror-p'} + +sub FILENAME () {'input'} +sub HANDLE () {'handle'} + +sub READAHEAD () {'readahead'} +sub LINENUM_START () {'lc_0'} +sub LINENUM () {'lc'} +sub CDATA_MODE_P () {'cdata-mode-p'} +sub CDATA_CLOSE () {'cdata-close'} +sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA +sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true + +sub ALLOW_CFORMAT_P () {'allow-cformat-p'} + +sub new { + my $this = shift; + my($input) = @_; + my $class = ref($this) || $this; + my $self = {}; + bless $self, $class; + + my $handle = sprintf('TMPLTOKENIZER%d', $serial); + $serial += 1; + + no strict; + open($handle, "<$input") || die "$input: $!\n"; + use strict; + $self->{+FILENAME} = $input; + $self->{+HANDLE} = $handle; + $self->{+READAHEAD} = []; + return $self; +} + +############################################################################### + +# Simple getters + +sub filename { + my $this = shift; + return $this->{+FILENAME}; +} + +sub _handle { + my $this = shift; + return $this->{+HANDLE}; +} + +sub fatal_p { + my $this = shift; + return $this->{+FATAL_P}; +} + +sub syntaxerror_p { + my $this = shift; + return $this->{+SYNTAXERROR_P}; +} + +sub has_readahead_p { + my $this = shift; + return @{$this->{+READAHEAD}}; +} + +sub _peek_readahead { + my $this = shift; + return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}]; +} + +sub line_number_start { + my $this = shift; + return $this->{+LINENUM_START}; +} + +sub line_number { + my $this = shift; + return $this->{+LINENUM}; +} + +sub cdata_mode_p { + my $this = shift; + return $this->{+CDATA_MODE_P}; +} + +sub pcdata_mode_p { + my $this = shift; + return $this->{+PCDATA_MODE_P}; +} + +sub js_mode_p { + my $this = shift; + return $this->{+JS_MODE_P}; +} + +sub cdata_close { + my $this = shift; + return $this->{+CDATA_CLOSE}; +} + +sub allow_cformat_p { + my $this = shift; + return $this->{+ALLOW_CFORMAT_P}; +} + +# Simple setters + +sub _set_fatal { + my $this = shift; + $this->{+FATAL_P} = $_[0]; + return $this; +} + +sub _set_syntaxerror { + my $this = shift; + $this->{+SYNTAXERROR_P} = $_[0]; + return $this; +} + +sub _push_readahead { + my $this = shift; + push @{$this->{+READAHEAD}}, $_[0]; + return $this; +} + +sub _pop_readahead { + my $this = shift; + return pop @{$this->{+READAHEAD}}; +} + +sub _append_readahead { + my $this = shift; + $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0]; + return $this; +} + +sub _set_readahead { + my $this = shift; + $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0]; + return $this; +} + +sub _increment_line_number { + my $this = shift; + $this->{+LINENUM} += 1; + return $this; +} + +sub _set_line_number_start { + my $this = shift; + $this->{+LINENUM_START} = $_[0]; + return $this; +} + +sub _set_cdata_mode { + my $this = shift; + $this->{+CDATA_MODE_P} = $_[0]; + return $this; +} + +sub _set_pcdata_mode { + my $this = shift; + $this->{+PCDATA_MODE_P} = $_[0]; + return $this; +} + +sub _set_js_mode { + my $this = shift; + $this->{+JS_MODE_P} = $_[0]; + return $this; +} + +sub _set_cdata_close { + my $this = shift; + $this->{+CDATA_CLOSE} = $_[0]; + return $this; +} + +sub set_allow_cformat { + my $this = shift; + $this->{+ALLOW_CFORMAT_P} = $_[0]; + return $this; +} + +############################################################################### + +use vars qw( $js_EscapeSequence ); +BEGIN { + # Perl quoting is really screwed up, but this common subexp is way too long + $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})}; +} +sub parenleft () { '(' } +sub parenright () { ')' } + +sub split_js ($) { + my ($s0) = @_; + my @it = (); + while (length $s0) { + if ($s0 =~ /^\s+/s) { # whitespace + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment + push @it, $&; + $s0 = $'; + # Keyword or identifier, ECMA-262 p.13 (section 7.5) + } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName + push @it, $&; + $s0 = $'; + # Punctuator, ECMA-262 p.13 (section 7.6) + } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { + push @it, $&; + $s0 = $'; + # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec + } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { + push @it, $&; + $s0 = $'; + # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # StringLiteral, ECMA-262 p.17 (section 7.7.4) + # XXX SourceCharacter doesn't seem to be defined (?) + } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! + push @it, $&; + $s0 = $'; + } + } + return @it; +} + +sub STATE_UNDERSCORE () { 1 } +sub STATE_PARENLEFT () { 2 } +sub STATE_STRING_LITERAL () { 3 } + +# XXX This is a crazy hack. I don't want to write an ECMAScript parser. +# XXX A scanner is one thing; a parser another thing. +sub identify_js_translatables (@) { + my @input = @_; + my @output = (); + # We mark a JavaScript translatable string as in C, i.e., _("literal") + # For simplicity, we ONLY look for "_" "(" StringLiteral ")" + for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) { + my $reset_state_p = 0; + push @output, [0, $input[$i]]; + if ($input[$i] !~ /\S/s) { + ; + } elsif ($state == 0) { + $state = STATE_UNDERSCORE if $input[$i] eq '_'; + } elsif ($state == STATE_UNDERSCORE) { + $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; + } elsif ($state == STATE_PARENLEFT) { + if ($input[$i] =~ /^(['"])(.*)\1$/s) { + ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); + } else { + $state = 0; + } + } elsif ($state == STATE_STRING_LITERAL) { + if ($input[$i] eq parenright) { + $output[$j] = [1, $output[$j]->[1], $q, $s]; + } + $state = 0; + } else { + die "identify_js_translatables internal error: Unknown state $state" + } + } + return \@output; +} + +############################################################################### + +sub _extract_attributes ($;$) { + my $this = shift; + my($s, $lc) = @_; + my %attr; + $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags + || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags + + for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) { + my($key, $val, $val_orig, $rest) + = ($1, (defined $3? $3: defined $4? $4: $5), $2, $'); + $i += 1; + $attr{+lc($key)} = [$key, $val, $val_orig, $i]; + $s = $rest; + if ($val =~ /$re_tmpl_include/os) { + warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc; + } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) { + # XXX: we probably should not warn if key is "onclick" etc + # XXX: there's just no reasonable thing to suggest + my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML'); + undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i; + warn_pedantic + "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\"" + . ": $val_orig", + $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p + if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p); + } elsif ($val_orig !~ /^['"]/) { + my $t = $val; $t =~ s/$re_directive_control//os; + warn_pedantic + "Unquoted attribute contains character(s) that should be quoted" + . ": $val_orig", + $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p + if $t =~ /[^-\.A-Za-z0-9]/s; + } + } + my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check + if ($s2 =~ /\S/s) { # should never happen + if ($s =~ /^([^\n]*)\n/s) { # this is even worse + error_normal("Completely confused while extracting attributes: $1", $lc); + error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef); + $this->_set_fatal( 1 ); + } else { + # There's something wrong with the attribute syntax. + # We might be able to deduce a likely cause by looking more. + if ($s =~ /^[a-z0-9]/is && "" =~ /^$re_tag_compat$/s) { + warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc; + } else { + warn_normal "Strange attribute syntax: $s\n", $lc; + } + } + } + return \%attr; +} + +sub _next_token_internal { + my $this = shift; + my($h) = @_; + my($it, $kind); + my $eof_p = 0; + $this->_pop_readahead if $this->has_readahead_p + && !ref $this->_peek_readahead + && !length $this->_peek_readahead; + if (!$this->has_readahead_p) { + my $next = scalar <$h>; + $eof_p = !defined $next; + if (!$eof_p) { + $this->_increment_line_number; + $this->_push_readahead( $next ); + } + } + $this->_set_line_number_start( $this->line_number ); # remember 1st line num + if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj. + ($it, $kind) = ($this->_pop_readahead, undef); + } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do + ; + } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace + ($kind, $it) = (TmplTokenType::TEXT, $&); + $this->_set_readahead( $' ); + # FIXME the following (the [<\s] part) is an unreliable HACK :-( + } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text + ($kind, $it) = (TmplTokenType::TEXT, $&); + $this->_set_readahead( $' ); + warn_normal "Unescaped < in $it\n", $this->line_number_start + if !$this->cdata_mode_p && $it =~ /cdata_close;;) { + if ($this->cdata_mode_p) { + my $next = $this->_pop_readahead; + if ($next =~ /^$cdata_close/is) { + ($kind, $it) = (TmplTokenType::TAG, $&); + $this->_push_readahead( $' ); + $ok_p = 1; + } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) { + ($kind, $it) = (TmplTokenType::TEXT, $1); + $this->_push_readahead( "$2$'" ); + $ok_p = 1; + } else { + ($kind, $it) = (TmplTokenType::TEXT, $next); + $ok_p = 1; + } + } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) { + # If we detect a "closed start tag" but we know that the + # following token looks like a TMPL_VAR, don't stop + my($head, $tail, $post) = ($1, $2, $3); + if ($tail eq '' && $post =~ $re_tmpl_var) { + # Don't bother to show the warning if we're too confused + # FIXME. There's no method for _closed_start_tag_warning + if (!defined $this->{'_closed_start_tag_warning'} + || ($this->{'_closed_start_tag_warning'}->[0] eq $head + && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) { + warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number + if split(/\n/, $head) < 10; + } + $this->{'_closed_start_tag_warning'} = [$head, $this->line_number]; + } else { + ($kind, $it) = (TmplTokenType::TAG, "$head>"); + $this->_set_readahead( $post ); + $ok_p = 1; + warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq ''; + } + } elsif ($this->_peek_readahead =~ /^)$re_directive*.)*-->/os) { + ($kind, $it) = (TmplTokenType::COMMENT, $&); + $this->_set_readahead( $' ); + $ok_p = 1; + $bad_comment_p = 1; + } + last if $ok_p; + my $next = scalar <$h>; + $eof_p = !defined $next; + last if $eof_p; + $this->_increment_line_number; + $this->_append_readahead( $next ); + } + if ($kind ne TmplTokenType::TAG) { + ; + } elsif ($it =~ /^).)*-->/; + if ($kind == TmplTokenType::COMMENT && $it =~ /^