You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

207 lines
7.4 KiB

  1. #ParseMaster (July 25 2005)
  2. # Based on "ParseMaster.js" by Dean Edwards <http://dean.edwards.name/>
  3. # Ported to Perl by Rob Seiler, ELR Software Pty Ltd <http://www.elr.com.au>
  4. # Copyright 2005. License <http://creativecommons.org/licenses/LGPL/2.1/>
  5. package ParseMaster;
  6. use strict;
  7. use Data::Dumper;
  8. # Package wide variable declarations
  9. use vars qw/$VERSION
  10. @_X_escaped @_X_patterns
  11. /;
  12. $VERSION = '017';
  13. # constants
  14. my $X_EXPRESSION = 0;
  15. my $X_REPLACEMENT = 1;
  16. my $X_LENGTH = 2;
  17. # re's used to determine nesting levels
  18. my $X_GROUPS = qr/\(/o; # NB: Requires g modifier!
  19. my $X_SUB_REPLACE = qr/\$\d/o;
  20. my $X_INDEXED = qr/^\$\d+$/o;
  21. my $XX_ESCAPE = qr/\\./o; # NB: Requires g modifier!
  22. my $XX_DELETED = qr/\001[^\001]*\001/o; # NB: Requires g modifier!
  23. my $DIGIT = qr/[^\D]/o; # Yep - this is a digit - contains no non-digits
  24. # Constructor
  25. sub new {
  26. my $class = shift;
  27. my $self = {};
  28. @_X_escaped = (); # Re-initialize global for each instance
  29. @_X_patterns = (); # Re-initialize global for each instance
  30. # Instance variables - access by similarly named set/get functions
  31. $self->{_ignoreCase_} = 0;
  32. $self->{_escapeChar_} = '';
  33. bless ($self, $class);
  34. return $self;
  35. }
  36. sub ignoreCase {
  37. my ($self, $value) = @_;
  38. if (defined($value)) {
  39. $self->{_ignoreCase_} = $value;
  40. }
  41. return $self->{_ignoreCase_};
  42. }
  43. sub escapeChar{
  44. my ($self, $value) = @_;
  45. if (defined($value)) {
  46. $self->{_escapeChar_} = $value;
  47. }
  48. return $self->{_escapeChar_};
  49. }
  50. #######################
  51. # Public Parsemaster functions
  52. my $X_DELETE = sub(@$) {
  53. my $X_offset = pop;
  54. my @X_match = @_;
  55. return (chr(001) . $X_match[$X_offset] . chr(001));
  56. }; # NB semicolon required for closure!
  57. # create and add a new pattern to the patterns collection
  58. sub add {
  59. my ($self, $expression, $X_replacement) = @_;
  60. if (!$X_replacement) {$X_replacement = $X_DELETE};
  61. # count the number of sub-expressions
  62. my $temp = &_X_internalEscape($expression);
  63. my $length = 1; # Always at least one because each pattern is itself a sub-expression
  64. $length += $temp =~ s/$X_GROUPS//g; # One way to count the left capturing parentheses in the regexp string
  65. # does the pattern deal with sub-expressions?
  66. if ((ref($X_replacement) ne "CODE") && ($X_replacement =~ m/$X_SUB_REPLACE/)) {
  67. if ($X_replacement =~ m/$X_INDEXED/) { # a simple lookup? (eg "$2")
  68. # store the index (used for fast retrieval of matched strings)
  69. $X_replacement = substr($X_replacement,1) - 1;
  70. }
  71. else { # a complicated lookup (eg "Hello $2 $1")
  72. my $i = $length;
  73. while ($i) { # Had difficulty getting Perl to do Dean's splitting and joining of strings containing $'s
  74. my $str = '$a[$o+' . ($i-1) . ']'; # eg $a[$o+1]
  75. $X_replacement =~ s/\$$i/$str/; # eg $2 $3 -> $a[$o+1] $a[$o+2]
  76. $i--;
  77. }
  78. # build a function to do the lookup - returns interpolated string of array lookups
  79. $X_replacement = eval('sub {my $o=pop; my @a=@_; return "' . $X_replacement . '"};');
  80. }
  81. }
  82. else {}
  83. # pass the modified arguments
  84. &_X_add($expression || q/^$/, $X_replacement, $length);
  85. }
  86. # execute the global replacement
  87. sub exec {
  88. #print Dumper(@_X_patterns);
  89. my ($self, $X_string) = @_;
  90. my $escChar = $self->escapeChar();
  91. my $ignoreCase = $self->ignoreCase();
  92. my ($regexp,$captures) = &_getPatterns(); # Concatenated and parenthesized regexp eg '(regex1)|(regex2)|(regex3)' etc
  93. $X_string = &_X_escape($X_string, $escChar);
  94. if ($ignoreCase) {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/gie} # Pass $X_String as a
  95. else {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/ge} # reference for speed
  96. $X_string = &_X_unescape($X_string, $escChar);
  97. $X_string =~ s/$XX_DELETED//g;
  98. return $X_string;
  99. }
  100. sub _X_add {
  101. push (@_X_patterns, [@_]); # Save each argument set as is into an array of arrays
  102. }
  103. # this is the global replace function (it's quite complicated)
  104. sub _X_replacement {
  105. my (@arguments) = @_;
  106. #print Dumper (@arguments);
  107. if ($arguments[0] le '') {return ''}
  108. # Dereference last index (source String) here - faster than in _matchVars (maybe not needed at all?)
  109. $arguments[$#arguments] = ${$arguments[$#arguments]};
  110. my $i = 1;
  111. # loop through the patterns
  112. for (my $j=0; $j<scalar(@_X_patterns); $j++) { # Loop through global all @_X_patterns
  113. my @X_pattern = @{$_X_patterns[$j]};
  114. # do we have a result? NB: "if ($arguments[$i])" as in Dean's Javascript is false for the value 0!!!
  115. if ((defined $arguments[$i]) && ($arguments[$i] gt '')) {
  116. my $X_replacement = $X_pattern[$X_REPLACEMENT];
  117. # switch on type of $replacement
  118. if (ref($X_replacement) eq "CODE") { # function
  119. return &$X_replacement(@arguments,$i);
  120. }
  121. elsif ($X_replacement =~ m/$DIGIT/) { # number (contains no non-digits)
  122. return $arguments[$X_replacement + $i];
  123. }
  124. else { # default
  125. return $X_replacement; # default
  126. }
  127. } # skip over references to sub-expressions
  128. else {$i += $X_pattern[$X_LENGTH]}
  129. }
  130. }
  131. #######################
  132. # Private functions
  133. #######################
  134. # encode escaped characters
  135. sub _X_escape {
  136. my ($X_string, $X_escapeChar) = @_;
  137. if ($X_escapeChar) {
  138. my $re = '\\'.$X_escapeChar.'(.)';
  139. $X_string =~ s/$re/{push(@_X_escaped,$1); $X_escapeChar}/ge;
  140. }
  141. return $X_string;
  142. }
  143. # decode escaped characters
  144. sub _X_unescape {
  145. my ($X_string, $X_escapeChar) = @_;
  146. if ($X_escapeChar) { # We'll only do this if there is an $X_escapeChar!
  147. my $re = '\\'.$X_escapeChar;
  148. $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped))}/ge; # Don't use Dean Edwards as below 'or' here - because zero will return ''!
  149. # $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped) || '')}/ge;
  150. }
  151. return $X_string;
  152. }
  153. sub _X_internalEscape {
  154. my ($string) = shift;
  155. $string =~ s/$XX_ESCAPE//g;
  156. return $string;
  157. }
  158. # Builds an array of match variables to (approximately) emulate that available in Javascript String.replace()
  159. sub _matchVars {
  160. my ($m,$sref) = @_;
  161. my @args = (1..$m); # establish the number potential memory variables
  162. my @mv = map {eval("\$$_")} @args; # matchvarv[1..m] = the memory variables $1 .. $m
  163. unshift (@mv, $&); # matchvar[0] = the substring that matched
  164. push (@mv, length($`)); # matchvar[m+1] = offset within the source string where the match occurred (= length of prematch string)
  165. push (@mv, $sref); # matchvar[m+2] = reference to full source string (dereference in caller if/when needed)
  166. #print Dumper (@mv);
  167. return @mv;
  168. }
  169. sub _getPatterns {
  170. my @Patterns = ();
  171. my $lcp = 0;
  172. for (my $i=0; $i<scalar(@_X_patterns); $i++) { # Loop through global all @_patterns
  173. push (@Patterns, $_X_patterns[$i][$X_EXPRESSION]); # accumulate the expressions
  174. $lcp += $_X_patterns[$i][$X_LENGTH]; # sum the left capturing parenthesis counts
  175. }
  176. my $str = "(" . join(')|(',@Patterns). ")"; # enclose each pattern in () separated by "|"
  177. return ($str, $lcp);
  178. }
  179. ##################
  180. # END #
  181. ##################
  182. 1; # ParseMaster #
  183. ##################