Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve --smart-case handling of meta-characters #672

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 98 additions & 3 deletions ack
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use warnings;
our $VERSION = '2.24'; # Check https://beyondgrep.com/ for updates

use 5.008008;
use charnames qw/ :full :short /;

use Getopt::Long 2.38 ();
use Carp 1.04 ();

Expand Down Expand Up @@ -322,12 +324,12 @@ sub build_regex {
$str = "$str\\b" if $pristine_str =~ /\w$/;
}

my $regex_is_lc = $str eq lc $str;
if ( $opt->{i} || ($opt->{smart_case} && $regex_is_lc) ) {
if ( $opt->{i} || ($opt->{smart_case} && regex_is_lc($str)) ) {
$str = "(?i)$str";
}

my $re = eval { qr/$str/m };
# Create a regex object. Use eval-string so \N{NAME} will resolve.
my $re = eval "qr{$str}m";
if ( !$re ) {
die "Invalid regex '$str':\n $@";
}
Expand All @@ -336,6 +338,99 @@ sub build_regex {

}

# Return true if regex consists of only lowercase characters.
# Excludes meta-characters (like /\S/ or /\W/), character escapes
# (like /\xFF/ or /\N{LATIN SMALL LETTER E}/), and other contructs
# that use uppercase characters.
sub regex_is_lc {
my $str = shift;

# Group name has to alphanumeric but can't start with a number.
my $named_capture_re = qr/[^\W\d]\w*/;

# Keep track of number of capturing parenthesis.
my $capture = 0;

# Remove or replace anything that uses an uppercase character but
# isn't an uppercase character.
$str =~ s[
\\
(?:
(?<escape>
x
(?:
[[:xdigit:]]{2} # Hex escape
|
\{[[:xdigit:]]+\} # Bracketed hex escape
)
|
(?<octal>[0-7]{3}) # Octal escape or back-reference
|
o\{[0-7]+\} # Bracketed octal escape
|
c[\p{PosixPrint}] # Control-X escape
|
N\{ (?:
U\+[[:xdigit:]]+ # Numbered Unicode character
|
[^\W\d][^}]* # Named Unicode character
) \}
)
|
[pP]
(?:
[A-Z] # Single-letter Unicode property
|
\{[^}]+\} # Unicode Property
)
|
[gk]\{$named_capture_re\} # Named back-reference
|
k
(?:
<$named_capture_re> # .Net named back-reference
|
'$named_capture_re' # .Net named back-reference
)
|
. # Single character escape
)
|
\(\?
(?:
(?<capture>
<$named_capture_re> # Named capture
|
'$named_capture_re' # Named capture
|
P<$named_capture_re> # Python named capture
)
|
P=$named_capture_re\) # Python named back-reference
)
|
(?<capture>\((?!\?)) # Capturing parenthesis
]{
my $esc;
if ($+{capture}) {
$capture++;
} elsif (my $oct = $+{octal}) {
$esc = $oct
if length $oct > 1 # Single digit is back-reference
and $oct =~ /^0/ # Leading zero is octal
|| $oct > $capture; # Has capture group is back-reference
} elsif ($+{escape}) {
$esc = $+{escape};
}
$esc
? eval qq/"\\$esc"/ # Replace escapes with characters
: ''; # Remove everything else
}gxe;

# Check if actual characters are all lowercase.
$str eq lc $str;
}

my $match_column_number;

{
Expand Down
103 changes: 95 additions & 8 deletions t/longopts.t
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ This tests whether ack's command line options work as expected.
=cut

use Test::More;
use charnames qw/ :full :short /;

# --no-recurse is inconsistent w/--nogroup

plan tests => 38;
plan tests => 66;

use lib 't';
use Util;
Expand Down Expand Up @@ -60,22 +61,108 @@ for my $arg ( qw( -i --ignore-case ) ) {
SMART_CASE: {
my @files = 't/swamp/options.pl';
my $opt = '--smart-case';
my $re = qr/ALL IN UPPER CASE/;

like(
+run_ack( $opt, 'upper case', @files ),
qr{UPPER CASE},
+run_ack( $opt, 'all in upper case', @files ),
$re,
qq{$opt turn on ignore-case when PATTERN has no upper}
);
unlike(
+run_ack( $opt, 'Upper case', @files ),
qr{UPPER CASE},
+run_ack( $opt, 'all in UPPER case', @files ),
$re,
qq{$opt does nothing when PATTERN has upper}
);

like(
+run_ack( $opt, '-i', 'UpPer CaSe', @files ),
qr{UPPER CASE},
+run_ack( $opt, '-i', 'AlL In UpPer CaSe', @files ),
$re,
qq{-i overrides $opt, forcing ignore case, even when PATTERN has upper}
);

# Uppercase characters that aren't really uppercase.
like(
+run_ack( $opt, 'all\Win\Dup\Bper ca\Se', @files ),
$re,
qq{$opt ignores upper in meta-characters}
);
like(
+run_ack( $opt, 'a\x6Cl i\x{006E} \N{U+0075}pper[^\cJ]cas\N{LATIN SMALL LETTER E}', @files ),
$re,
qq{$opt ignores upper in character escapes}
);
like(
+run_ack( $opt, '\pLll\p{PosixSpace}in\PNupper\P{PosixDigit}case', @files ),
$re,
qq{$opt ignores upper in Unicode properties}
);
like(
+run_ack( $opt, q[a(?<L>l)\k{L}(?<SPACE> )in\k'SPACE'u(?'P'p)\g{P}(?P<E>e)r(?P=SPACE)cas\k{E}], @files ),
$re,
qq{$opt ignores upper in named captures}
);

# Uppercase characters may have been escaped.
unlike(
+run_ack( $opt, '\x41ll in upper case', @files ),
$re,
qq{$opt sees upper "A" in hex escape}
);
unlike(
+run_ack( $opt, 'all \x{0049}n upper case', @files ),
$re,
qq{$opt sees upper "I" in bracketed hex escape}
);
unlike(
+run_ack( $opt, 'all in \125pper case', @files ),
$re,
qq{$opt sees upper "U" in octal escape}
);
SKIP: {
{
no warnings;
skip "Bracketed octal escapes not implemented", 2
if "\o{122}" eq "o{122}";
}
unlike(
+run_ack( $opt, 'all in uppe\o{122} case', @files ),
$re,
qq{$opt sees upper "R" in bracketed octal escape}
);
}
unlike(
+run_ack( $opt, 'all in upper \N{U+0043}ase', @files ),
$re,
qq{$opt sees upper "CU" in numbered Unicode character}
);
unlike(
+run_ack( $opt, 'all in upper cas\N{LATIN CAPITAL LETTER E}', @files ),
$re,
qq{$opt sees upper "E" in named Unicode character}
);

# \120 is either back-reference to 120th capture or the letter "P".
my $start = '(' x 119;
my $end = ')' x 120;
unlike(
+run_ack( $opt, 'all in u(((p)))\120er case', @files ),
$re,
qq{$opt sees upper "P" in octal escape}
);
like(
+run_ack( $opt, "all in u$start(p$end\\120er case", @files ),
$re,
qq{$opt sees back-reference to 120th capture}
);
unlike(
+run_ack( $opt, "all in u$start(?:p$end\\120er case", @files ),
$re,
qq{$opt sees 119 capture groups - (?:non-capturing)}
);
like(
+run_ack( $opt, "all in u$start(?<NAME>p$end\\120er case", @files ),
$re,
qq{$opt sees 120 capture groups - (?<NAME>capturing)}
);
}

# Invert match
Expand Down