diff --git a/ack b/ack index 921d8574..8191745f 100755 --- a/ack +++ b/ack @@ -5,6 +5,8 @@ use warnings; our $VERSION = '2.24'; # Check https://beyondgrep.com/ for updates use 5.008008; +use charnames qw/ :full :short /; + use Getopt::Long 2.38 (); use Carp 1.04 (); @@ -322,12 +324,12 @@ sub build_regex { $str = "$str\\b" if $pristine_str =~ /\w$/; } - my $regex_is_lc = $str eq lc $str; - if ( $opt->{i} || ($opt->{smart_case} && $regex_is_lc) ) { + if ( $opt->{i} || ($opt->{smart_case} && regex_is_lc($str)) ) { $str = "(?i)$str"; } - my $re = eval { qr/$str/m }; + # Create a regex object. Use eval-string so \N{NAME} will resolve. + my $re = eval "qr{$str}m"; if ( !$re ) { die "Invalid regex '$str':\n $@"; } @@ -336,6 +338,99 @@ sub build_regex { } +# Return true if regex consists of only lowercase characters. +# Excludes meta-characters (like /\S/ or /\W/), character escapes +# (like /\xFF/ or /\N{LATIN SMALL LETTER E}/), and other contructs +# that use uppercase characters. +sub regex_is_lc { + my $str = shift; + + # Group name has to alphanumeric but can't start with a number. + my $named_capture_re = qr/[^\W\d]\w*/; + + # Keep track of number of capturing parenthesis. + my $capture = 0; + + # Remove or replace anything that uses an uppercase character but + # isn't an uppercase character. + $str =~ s[ + \\ + (?: + (? + x + (?: + [[:xdigit:]]{2} # Hex escape + | + \{[[:xdigit:]]+\} # Bracketed hex escape + ) + | + (?[0-7]{3}) # Octal escape or back-reference + | + o\{[0-7]+\} # Bracketed octal escape + | + c[\p{PosixPrint}] # Control-X escape + | + N\{ (?: + U\+[[:xdigit:]]+ # Numbered Unicode character + | + [^\W\d][^}]* # Named Unicode character + ) \} + ) + | + [pP] + (?: + [A-Z] # Single-letter Unicode property + | + \{[^}]+\} # Unicode Property + ) + | + [gk]\{$named_capture_re\} # Named back-reference + | + k + (?: + <$named_capture_re> # .Net named back-reference + | + '$named_capture_re' # .Net named back-reference + ) + | + . # Single character escape + ) + | + \(\? + (?: + (? + <$named_capture_re> # Named capture + | + '$named_capture_re' # Named capture + | + P<$named_capture_re> # Python named capture + ) + | + P=$named_capture_re\) # Python named back-reference + ) + | + (?\((?!\?)) # Capturing parenthesis + ]{ + my $esc; + if ($+{capture}) { + $capture++; + } elsif (my $oct = $+{octal}) { + $esc = $oct + if length $oct > 1 # Single digit is back-reference + and $oct =~ /^0/ # Leading zero is octal + || $oct > $capture; # Has capture group is back-reference + } elsif ($+{escape}) { + $esc = $+{escape}; + } + $esc + ? eval qq/"\\$esc"/ # Replace escapes with characters + : ''; # Remove everything else + }gxe; + + # Check if actual characters are all lowercase. + $str eq lc $str; +} + my $match_column_number; { diff --git a/t/longopts.t b/t/longopts.t index 666bd5b5..72c8d14d 100644 --- a/t/longopts.t +++ b/t/longopts.t @@ -10,10 +10,11 @@ This tests whether ack's command line options work as expected. =cut use Test::More; +use charnames qw/ :full :short /; # --no-recurse is inconsistent w/--nogroup -plan tests => 38; +plan tests => 66; use lib 't'; use Util; @@ -60,22 +61,108 @@ for my $arg ( qw( -i --ignore-case ) ) { SMART_CASE: { my @files = 't/swamp/options.pl'; my $opt = '--smart-case'; + my $re = qr/ALL IN UPPER CASE/; + like( - +run_ack( $opt, 'upper case', @files ), - qr{UPPER CASE}, + +run_ack( $opt, 'all in upper case', @files ), + $re, qq{$opt turn on ignore-case when PATTERN has no upper} ); unlike( - +run_ack( $opt, 'Upper case', @files ), - qr{UPPER CASE}, + +run_ack( $opt, 'all in UPPER case', @files ), + $re, qq{$opt does nothing when PATTERN has upper} ); - like( - +run_ack( $opt, '-i', 'UpPer CaSe', @files ), - qr{UPPER CASE}, + +run_ack( $opt, '-i', 'AlL In UpPer CaSe', @files ), + $re, qq{-i overrides $opt, forcing ignore case, even when PATTERN has upper} ); + + # Uppercase characters that aren't really uppercase. + like( + +run_ack( $opt, 'all\Win\Dup\Bper ca\Se', @files ), + $re, + qq{$opt ignores upper in meta-characters} + ); + like( + +run_ack( $opt, 'a\x6Cl i\x{006E} \N{U+0075}pper[^\cJ]cas\N{LATIN SMALL LETTER E}', @files ), + $re, + qq{$opt ignores upper in character escapes} + ); + like( + +run_ack( $opt, '\pLll\p{PosixSpace}in\PNupper\P{PosixDigit}case', @files ), + $re, + qq{$opt ignores upper in Unicode properties} + ); + like( + +run_ack( $opt, q[a(?l)\k{L}(? )in\k'SPACE'u(?'P'p)\g{P}(?Pe)r(?P=SPACE)cas\k{E}], @files ), + $re, + qq{$opt ignores upper in named captures} + ); + + # Uppercase characters may have been escaped. + unlike( + +run_ack( $opt, '\x41ll in upper case', @files ), + $re, + qq{$opt sees upper "A" in hex escape} + ); + unlike( + +run_ack( $opt, 'all \x{0049}n upper case', @files ), + $re, + qq{$opt sees upper "I" in bracketed hex escape} + ); + unlike( + +run_ack( $opt, 'all in \125pper case', @files ), + $re, + qq{$opt sees upper "U" in octal escape} + ); + SKIP: { + { + no warnings; + skip "Bracketed octal escapes not implemented", 2 + if "\o{122}" eq "o{122}"; + } + unlike( + +run_ack( $opt, 'all in uppe\o{122} case', @files ), + $re, + qq{$opt sees upper "R" in bracketed octal escape} + ); + } + unlike( + +run_ack( $opt, 'all in upper \N{U+0043}ase', @files ), + $re, + qq{$opt sees upper "CU" in numbered Unicode character} + ); + unlike( + +run_ack( $opt, 'all in upper cas\N{LATIN CAPITAL LETTER E}', @files ), + $re, + qq{$opt sees upper "E" in named Unicode character} + ); + + # \120 is either back-reference to 120th capture or the letter "P". + my $start = '(' x 119; + my $end = ')' x 120; + unlike( + +run_ack( $opt, 'all in u(((p)))\120er case', @files ), + $re, + qq{$opt sees upper "P" in octal escape} + ); + like( + +run_ack( $opt, "all in u$start(p$end\\120er case", @files ), + $re, + qq{$opt sees back-reference to 120th capture} + ); + unlike( + +run_ack( $opt, "all in u$start(?:p$end\\120er case", @files ), + $re, + qq{$opt sees 119 capture groups - (?:non-capturing)} + ); + like( + +run_ack( $opt, "all in u$start(?p$end\\120er case", @files ), + $re, + qq{$opt sees 120 capture groups - (?capturing)} + ); } # Invert match