perl regex

by dan 10/05/06 12:00:00 Article perl.


Regular Expression

/ reg exp /
# match string, returns 1 or 0 (True/False)
if (/ab/) {} # if $_ matches /ab/
/ reg exp /i
# ignore case
$x=~/ reg exp/
# match using $x, not $_
$a="abcdef";
if ($a=~/def/) { }
if (<STDIN>=~/^[yY]/) { } #yes
m#regexp#
#allows different delimiters to be used. Good if using lots of '/' in regexp.
m@usr@  #same as /usr/
m*usr/bin*   #same as /usr\/bin/
m#^/usr/etc#  #same as /^\/usr\/etc/
s/regexp/replacement str/
# Substitute on $_
s/ab*c/def/ # replace ab*c w/ 'def'
s/regexp/repstr/g
# global sub, not just the 1st match # returns # of substitutions made.
s#regexp#rep str#
#can use any char as delimiter as long as they are 3. s@abc@123@ #same as s/abc/123/
$x=~s/regexp/repstr/;
# =~ uses an alternate target instead of $_. $d{"t"}=~s/^/x/; #prepend 'x' to hash var's value
$1,$2,$3,...
#memorized parts of reg exp are saved in $1,$2,... $_="abc"; /(.)(.)(.)/; # $1='a',$2='b',$3='c'; $_="a test"; s/(\w+)/<$1>/g; # $_="<a> <test>"
($var1,$var2)=/(.)(.)(.)/;
# $var1='a',$var2='b', alternative to $1,$2
$&
# / / saves the matched string in this var.
$_="abcdef";
/cd/;
# $& now contains 'cd'

$`

  • / / saves the string before match, (backquote)
  • $` is 'ab'

$'

  • / / saves the string after match, (single quote)
  • $' is now 'ef'

REG EXP CHARS

.
# match any char except \n
[ ]
    #match 1 char
    /[abc]/  # if str contains 'a','b',or 'c'
    /[a-z]/ # any lower-case alphabet char
    /[0-9\-]/  #0-9 or '-'
    /[a-z0-9A-Z_]/  #alphanumeric or '_'

[^ ]
    #match except char
    /[^0-9]/  #match any non-digit char
    /[^aeiou]/ #match anything except lower-case vowel

\d
    # [0-9] shortcut

\D
    # [^0-9]

\w
    #[a-zA-Z0-9_]

\W
    #[^a-zA-Z0-9_]

\s
    #[ \r\t\n\f]

\S
    #[^ \r\t\n\f]

mixed example:

/[\da-fA-F]/  #matches 1 hex digit
/\d/ # match if string contains a digit (use /^\d/ to require the 1st char to be a digit)
*
    # multiple, 0 or more
    /ab*c/  # a,0 or more b, c

+
    # multiple, 1 or more
    /ab+c/
    s/x+/QQ/  # if $_="a xxx b" -> "a QQ b"

?
    #multiple, 0 or 1, no more
    /ab?c/

/x{min,max}/
    # general multiplier
/t{5,10}/ #matches 5-10 't's.
/t{5,}/  # same as /ttttt+/
/t{5}/  # same as /ttttt/
/t{0,5}/ # 5 or less t's
/a.{5}b/ #a separated by any 5 char and b.

Greedy Rule

Leftmost is greediest
$_="a xxx c xxx c xxx d";
/a.*c.*d/;  # greedy: 1st .* runs up to the 2nd 'c', not the 1st
/a.*?c.*d/; # lazy, non-greedy: 1st .*? stops at the 1st 'c'
Adding "?" after a multiplier (".*?", ".+?", "{m,n}?") makes it non-greedy.
Automatically backtracks to find correct match
    $_="/a xxx ce xxx ci xx d";
    /a.*ce.*d/  # automatically backtracks
/( )/
    # store in memory. Also sets precedence.  See Precedence.
    /fred(.)barney/ # the single character between fred and barney is stored in 1st mem

/ \1 \2 \3 /
    #recall memory. \1=1st mem, \2=2nd mem, \3=3rd mem...
    # They are also stored in var $1,$2,$3,...
    /fred(.)barney\1/ # match fred, any char, barney, any char (that is same as 1st any char)
        # not same as /fred.barney./ since 1st char must match 2nd char.
    /a(.)b(.)c\2d\1/;
        # 'a',any char,'b', any char, c, char (same as after b),d, char(same as after a)
    /a(.*)b\1c/;
        # 'a',any # of char,b, same string after 'a', 'c'.
/s1|s2|s3/
    # match either words
    /red|green|blue/ # match 1 of 3 color
\b
    #boundary anchor (ie alphanumeric char boundary)
    #if word is alphanumeric, it will form boundary w/ non-alphanumeric
    # if word is non-alphanumeric, it will form boundary w/ alphanumeric char
    # Can appear anywhere in the regexp, but is most useful at the beginning or end. /\b.../, /...\b/
    /fred\b/ # match fred, but not freddy
    /\bmo/  #match moby, but not elmo
    /\b\+\b/  # match "x+y" but not "++"
    /ABC\bDEF/ #impossible bc 'C' and 'D' are both word chars, so no boundary exists between them
\B
    #require no boundary
    /\bFred\B/, matches "Frederick", but not "Fred Foe" since boundary exists after Fred.
^
    # beginning of line, must be the 1st char.
    /^a/  # 1st letter must be 'a'.
    /aaa^/ # Wrong. Cannot have ^ at anywhere except 1st
    s/^/x/; #^ by itself means prepend. Prepend "x" to $_.

# don't be confused with [^]
$
    # end of line, must be last char
    /c$/  # last char must be 'c'
    /$cccc/  #Wrong. It looks for var $cccc instead.

$var
    #all vars are interpolated.
    $word="ab*c";
    if (/$word/) { }  #becomes /ab*c/

\Q...\E
    #use literal, vars are interpolated but as literal, not reg exp.
    $word="ab*c";
    if (/\Q$word\E/) { } #becomes /ab\*c/, not /ab*c/

Precedence

  1. ( ), (?: ) #paren
  2. ?+* {m,n} #multiplier
  3. abc ^ $ \b \B #sequence, anchoring
  4. a|b #alternative
/a|b*/ = a|(b*), not (a|b)*
abc*= ab,abc,abcc,abccc,...
(abc)*="",abc,abcabc,abcabcabc,...
^x|y= matches x as 1st char, or y anywhere
^(x|y) = 1st char/word must be x or y
a|bc|d = a,bc,or d
(a|b)(c|d)= ac,bc,ad,bd
(red|blue)car= redcar , bluecar

TRANSLITERATION

tr/old/new/;
    #subst old pattern with new pattern,

tr/ab/ba/;
    #subst a->b, b->a.

tr/a-z/A-Z/;
    # capitalize.

tr/a-z/x/;
    # replace all lower-case to 'x'

tr/a-z/ABC/;
        # cap a-c. all other lower-case to 'C'

tr/a-z/x/d;
        # "d"= delete matched chars that have no corresponding replacement char
        # ie a becomes x, 'b-z' are deleted. Chars not in a-z are untouched.
        # 'abcDEF' -> 'xDEF'

$count=tr/../../;
    # Counts

$count=tr/a-z//;
        # counts # of letters that are lower-case.

$count=tr/a-z/A-Z/;
        # upper-case and counts # of chars that were converted

tr/../../c;
    $count=tr/a-z//c;
        # 'c' complement, ie all except
        # count all letter except lowercase a-z.

$count=tr/a-z/_/c;
    # replace all char to '_' except a-z

$count=tr/a-z//cd;
    # delete all letter except a-z

tr/../../s
    # 's'=squeeze, ie multiple letters are replaced by 1 letter if same.
    $_='abcccc'; tr/c/x/s;
        # result: 'abx'
    $_='bc def ghi'; tr/a-z/x/s;
        # result: 'x x x'
    $_='abc -def--ghi'; tr/a-z/_/cs;
        # result: 'abc_def_ghi'

REG EXP FUNCTIONS

@a=split(/regexp/,$str) ;
    #splits line into array of words at the regexp match.
    # $str= $_ by default.
    $_="abc:def:123";
    @a=split(/:/) ;  # @a=(abc,def,123)

@a=split;
    # same as @a=split(' ',$_) ; like /\s+/ but leading whitespace is skipped.  #nice shortcut.

$bigString=join($glue,@list) ;
    # opposite of split, joins array of str into 1 big string
    @list=(a,b,c) ;
    $result=join(":",@list) ;   #result="a:b:c";
    $result=join("+","5","7") ;  # ="5+7"

BASIC STANDARD I/O

$a=<STDIN>;
        # read line
        while (defined($line=<STDIN>)) {}
        while (<STDIN>) { chomp} # shorter version of above
@a=<STDIN>;
        # read multiple lines
<>
        # Reads lines from the files named in @ARGV. @ARGV contains cmdline args
        a.pl : while (<>) {print;}
        >a.pl a.txt b.txt c.txt
        # outputs all text files. If no arg is defined, it uses STDIN.
        # sim to 'cat'
printf() ;
        # just like C
open(FILEHANDLE,"filename") ;
        #Open file to read.
        open(MY_TEXT,"me.txt") ;
open(FILEHANDLE,">out.txt") ;
        #'>' = WRITE mode
open(FILEHANDLE,">>append.txt") ;
        #'>>'=APPEND mode
close(FILEHANDLE) ;
        #close file.
        close (MY_TEXT) ;
$a=<FILEHANDLE>;
        #Read a line from file.
@a=<FILEHANDLE>;
        #read all lines from file.
        open(PWD,"/etc/passwd") ;
        while (<PWD>) { print}
        # each line already contains \n so no need to print \n.
print FILEHANDLE " ";
        #write line to file
        print MY_TEXT "Hello\n";
CopyFile Ex
        open(IN,$a) || die $!;
        open(OUT,">$b") || die $!;
        while (<IN>) { print OUT $_;}
        close (IN) || die;
        close (OUT) || die;

FILE TEST FUNC "-X"

Works on both filename and FILEHANDLE

-e
        #filename exists
        if (-e $filename) { }
-r
        #file is readable
        if (-r $filename) { }
-w
        #is writable
-x
        #is executable
-o
        #is owned by user
-R,-W,-X,-O
        # is readable,writable,... by real user, not effective user
-z
        # file exists and has 0 size (dir is never 0)
-s
        # file/dir exists and is non-zero size
-f
        # is a file
-d
        # is a dir
-l
        #is a symbolic (soft) link
-S
        # is a socket
-p
        # is a pipe
-b
        # block-special file (mountable disk)
-c
        # char-special file (I/O device)
-u
        # is setuid
-g
        # is setgid
-k
        #has sticky bit set
-t
        #isatty() is true
-T
        #is a text file
-B
        #is a binary file
-M
        # returns modified date age in # of days
-A
        #returns accessed date age in # of days
-C
        #returns inode mod age in # of days


Digg! del.icio.us reddit furl