06: Pattern matching in strings

STA35B: Statistical Data Science 2

Akira Horiguchi

Data we will look at:

stringr::fruit: 80 fruits

fruit

 [1] "apple"             "apricot"           "avocado"          
 [4] "banana"            "bell pepper"       "bilberry"         
 [7] "blackberry"        "blackcurrant"      "blood orange"     
[10] "blueberry"         "boysenberry"       "breadfruit"       
[13] "canary melon"      "cantaloupe"        "cherimoya"        
[16] "cherry"            "chili pepper"      "clementine"       
[19] "cloudberry"        "coconut"           "cranberry"        
[22] "cucumber"          "currant"           "damson"           
[25] "date"              "dragonfruit"       "durian"           
[28] "eggplant"          "elderberry"        "feijoa"           
[31] "fig"               "goji berry"        "gooseberry"       
[34] "grape"             "grapefruit"        "guava"            
[37] "honeydew"          "huckleberry"       "jackfruit"        
[40] "jambul"            "jujube"            "kiwi fruit"       
[43] "kumquat"           "lemon"             "lime"             
[46] "loquat"            "lychee"            "mandarine"        
[49] "mango"             "mulberry"          "nectarine"        
[52] "nut"               "olive"             "orange"           
[55] "pamelo"            "papaya"            "passionfruit"     
[58] "peach"             "pear"              "persimmon"        
[61] "physalis"          "pineapple"         "plum"             
[64] "pomegranate"       "pomelo"            "purple mangosteen"
[67] "quince"            "raisin"            "rambutan"         
[70] "raspberry"         "redcurrant"        "rock melon"       
[73] "salal berry"       "satsuma"           "star fruit"       
[76] "strawberry"        "tamarillo"         "tangerine"        
[79] "ugli fruit"        "watermelon"

stringr::words: 980 common English words

words

  [1] "a"           "able"        "about"       "absolute"    "accept"     
  [6] "account"     "achieve"     "across"      "act"         "active"     
 [11] "actual"      "add"         "address"     "admit"       "advertise"  
 [16] "affect"      "afford"      "after"       "afternoon"   "again"      
 [21] "against"     "age"         "agent"       "ago"         "agree"      
 [26] "air"         "all"         "allow"       "almost"      "along"      
 [31] "already"     "alright"     "also"        "although"    "always"     
 [36] "america"     "amount"      "and"         "another"     "answer"     
 [41] "any"         "apart"       "apparent"    "appear"      "apply"      
 [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
 [51] "arm"         "around"      "arrange"     "art"         "as"         
 [56] "ask"         "associate"   "assume"      "at"          "attend"     
 [61] "authority"   "available"   "aware"       "away"        "awful"      
 [66] "baby"        "back"        "bad"         "bag"         "balance"    
 [71] "ball"        "bank"        "bar"         "base"        "basis"      
 [76] "be"          "bear"        "beat"        "beauty"      "because"    
 [81] "become"      "bed"         "before"      "begin"       "behind"     
 [86] "believe"     "benefit"     "best"        "bet"         "between"    
 [91] "big"         "bill"        "birth"       "bit"         "black"      
 [96] "bloke"       "blood"       "blow"        "blue"        "board"      
[101] "boat"        "body"        "book"        "both"        "bother"     
[106] "bottle"      "bottom"      "box"         "boy"         "break"      
[111] "brief"       "brilliant"   "bring"       "britain"     "brother"    
[116] "budget"      "build"       "bus"         "business"    "busy"       
[121] "but"         "buy"         "by"          "cake"        "call"       
[126] "can"         "car"         "card"        "care"        "carry"      
[131] "case"        "cat"         "catch"       "cause"       "cent"       
[136] "centre"      "certain"     "chair"       "chairman"    "chance"     
[141] "change"      "chap"        "character"   "charge"      "cheap"      
[146] "check"       "child"       "choice"      "choose"      "Christ"     
[151] "Christmas"   "church"      "city"        "claim"       "class"      
[156] "clean"       "clear"       "client"      "clock"       "close"      
[161] "closes"      "clothe"      "club"        "coffee"      "cold"       
[166] "colleague"   "collect"     "college"     "colour"      "come"       
[171] "comment"     "commit"      "committee"   "common"      "community"  
[176] "company"     "compare"     "complete"    "compute"     "concern"    
[181] "condition"   "confer"      "consider"    "consult"     "contact"    
[186] "continue"    "contract"    "control"     "converse"    "cook"       
[191] "copy"        "corner"      "correct"     "cost"        "could"      
[196] "council"     "count"       "country"     "county"      "couple"     
[201] "course"      "court"       "cover"       "create"      "cross"      
[206] "cup"         "current"     "cut"         "dad"         "danger"     
[211] "date"        "day"         "dead"        "deal"        "dear"       
[216] "debate"      "decide"      "decision"    "deep"        "definite"   
[221] "degree"      "department"  "depend"      "describe"    "design"     
[226] "detail"      "develop"     "die"         "difference"  "difficult"  
[231] "dinner"      "direct"      "discuss"     "district"    "divide"     
[236] "do"          "doctor"      "document"    "dog"         "door"       
[241] "double"      "doubt"       "down"        "draw"        "dress"      
[246] "drink"       "drive"       "drop"        "dry"         "due"        
[251] "during"      "each"        "early"       "east"        "easy"       
[256] "eat"         "economy"     "educate"     "effect"      "egg"        
[261] "eight"       "either"      "elect"       "electric"    "eleven"     
[266] "else"        "employ"      "encourage"   "end"         "engine"     
[271] "english"     "enjoy"       "enough"      "enter"       "environment"
[276] "equal"       "especial"    "europe"      "even"        "evening"    
[281] "ever"        "every"       "evidence"    "exact"       "example"    
[286] "except"      "excuse"      "exercise"    "exist"       "expect"     
[291] "expense"     "experience"  "explain"     "express"     "extra"      
[296] "eye"         "face"        "fact"        "fair"        "fall"       
[301] "family"      "far"         "farm"        "fast"        "father"     
[306] "favour"      "feed"        "feel"        "few"         "field"      
[311] "fight"       "figure"      "file"        "fill"        "film"       
[316] "final"       "finance"     "find"        "fine"        "finish"     
[321] "fire"        "first"       "fish"        "fit"         "five"       
[326] "flat"        "floor"       "fly"         "follow"      "food"       
[331] "foot"        "for"         "force"       "forget"      "form"       
[336] "fortune"     "forward"     "four"        "france"      "free"       
[341] "friday"      "friend"      "from"        "front"       "full"       
[346] "fun"         "function"    "fund"        "further"     "future"     
[351] "game"        "garden"      "gas"         "general"     "germany"    
[356] "get"         "girl"        "give"        "glass"       "go"         
[361] "god"         "good"        "goodbye"     "govern"      "grand"      
[366] "grant"       "great"       "green"       "ground"      "group"      
[371] "grow"        "guess"       "guy"         "hair"        "half"       
[376] "hall"        "hand"        "hang"        "happen"      "happy"      
[381] "hard"        "hate"        "have"        "he"          "head"       
[386] "health"      "hear"        "heart"       "heat"        "heavy"      
[391] "hell"        "help"        "here"        "high"        "history"    
[396] "hit"         "hold"        "holiday"     "home"        "honest"     
[401] "hope"        "horse"       "hospital"    "hot"         "hour"       
[406] "house"       "how"         "however"     "hullo"       "hundred"    
[411] "husband"     "idea"        "identify"    "if"          "imagine"    
[416] "important"   "improve"     "in"          "include"     "income"     
[421] "increase"    "indeed"      "individual"  "industry"    "inform"     
[426] "inside"      "instead"     "insure"      "interest"    "into"       
[431] "introduce"   "invest"      "involve"     "issue"       "it"         
[436] "item"        "jesus"       "job"         "join"        "judge"      
[441] "jump"        "just"        "keep"        "key"         "kid"        
[446] "kill"        "kind"        "king"        "kitchen"     "knock"      
[451] "know"        "labour"      "lad"         "lady"        "land"       
[456] "language"    "large"       "last"        "late"        "laugh"      
[461] "law"         "lay"         "lead"        "learn"       "leave"      
[466] "left"        "leg"         "less"        "let"         "letter"     
[471] "level"       "lie"         "life"        "light"       "like"       
[476] "likely"      "limit"       "line"        "link"        "list"       
[481] "listen"      "little"      "live"        "load"        "local"      
[486] "lock"        "london"      "long"        "look"        "lord"       
[491] "lose"        "lot"         "love"        "low"         "luck"       
[496] "lunch"       "machine"     "main"        "major"       "make"       
[501] "man"         "manage"      "many"        "mark"        "market"     
[506] "marry"       "match"       "matter"      "may"         "maybe"      
[511] "mean"        "meaning"     "measure"     "meet"        "member"     
[516] "mention"     "middle"      "might"       "mile"        "milk"       
[521] "million"     "mind"        "minister"    "minus"       "minute"     
[526] "miss"        "mister"      "moment"      "monday"      "money"      
[531] "month"       "more"        "morning"     "most"        "mother"     
[536] "motion"      "move"        "mrs"         "much"        "music"      
[541] "must"        "name"        "nation"      "nature"      "near"       
[546] "necessary"   "need"        "never"       "new"         "news"       
[551] "next"        "nice"        "night"       "nine"        "no"         
[556] "non"         "none"        "normal"      "north"       "not"        
[561] "note"        "notice"      "now"         "number"      "obvious"    
[566] "occasion"    "odd"         "of"          "off"         "offer"      
[571] "office"      "often"       "okay"        "old"         "on"         
[576] "once"        "one"         "only"        "open"        "operate"    
[581] "opportunity" "oppose"      "or"          "order"       "organize"   
[586] "original"    "other"       "otherwise"   "ought"       "out"        
[591] "over"        "own"         "pack"        "page"        "paint"      
[596] "pair"        "paper"       "paragraph"   "pardon"      "parent"     
[601] "park"        "part"        "particular"  "party"       "pass"       
[606] "past"        "pay"         "pence"       "pension"     "people"     
[611] "per"         "percent"     "perfect"     "perhaps"     "period"     
[616] "person"      "photograph"  "pick"        "picture"     "piece"      
[621] "place"       "plan"        "play"        "please"      "plus"       
[626] "point"       "police"      "policy"      "politic"     "poor"       
[631] "position"    "positive"    "possible"    "post"        "pound"      
[636] "power"       "practise"    "prepare"     "present"     "press"      
[641] "pressure"    "presume"     "pretty"      "previous"    "price"      
[646] "print"       "private"     "probable"    "problem"     "proceed"    
[651] "process"     "produce"     "product"     "programme"   "project"    
[656] "proper"      "propose"     "protect"     "provide"     "public"     
[661] "pull"        "purpose"     "push"        "put"         "quality"    
[666] "quarter"     "question"    "quick"       "quid"        "quiet"      
[671] "quite"       "radio"       "rail"        "raise"       "range"      
[676] "rate"        "rather"      "read"        "ready"       "real"       
[681] "realise"     "really"      "reason"      "receive"     "recent"     
[686] "reckon"      "recognize"   "recommend"   "record"      "red"        
[691] "reduce"      "refer"       "regard"      "region"      "relation"   
[696] "remember"    "report"      "represent"   "require"     "research"   
[701] "resource"    "respect"     "responsible" "rest"        "result"     
[706] "return"      "rid"         "right"       "ring"        "rise"       
[711] "road"        "role"        "roll"        "room"        "round"      
[716] "rule"        "run"         "safe"        "sale"        "same"       
[721] "saturday"    "save"        "say"         "scheme"      "school"     
[726] "science"     "score"       "scotland"    "seat"        "second"     
[731] "secretary"   "section"     "secure"      "see"         "seem"       
[736] "self"        "sell"        "send"        "sense"       "separate"   
[741] "serious"     "serve"       "service"     "set"         "settle"     
[746] "seven"       "sex"         "shall"       "share"       "she"        
[751] "sheet"       "shoe"        "shoot"       "shop"        "short"      
[756] "should"      "show"        "shut"        "sick"        "side"       
[761] "sign"        "similar"     "simple"      "since"       "sing"       
[766] "single"      "sir"         "sister"      "sit"         "site"       
[771] "situate"     "six"         "size"        "sleep"       "slight"     
[776] "slow"        "small"       "smoke"       "so"          "social"     
[781] "society"     "some"        "son"         "soon"        "sorry"      
[786] "sort"        "sound"       "south"       "space"       "speak"      
[791] "special"     "specific"    "speed"       "spell"       "spend"      
[796] "square"      "staff"       "stage"       "stairs"      "stand"      
[801] "standard"    "start"       "state"       "station"     "stay"       
[806] "step"        "stick"       "still"       "stop"        "story"      
[811] "straight"    "strategy"    "street"      "strike"      "strong"     
[816] "structure"   "student"     "study"       "stuff"       "stupid"     
[821] "subject"     "succeed"     "such"        "sudden"      "suggest"    
[826] "suit"        "summer"      "sun"         "sunday"      "supply"     
[831] "support"     "suppose"     "sure"        "surprise"    "switch"     
[836] "system"      "table"       "take"        "talk"        "tape"       
[841] "tax"         "tea"         "teach"       "team"        "telephone"  
[846] "television"  "tell"        "ten"         "tend"        "term"       
[851] "terrible"    "test"        "than"        "thank"       "the"        
[856] "then"        "there"       "therefore"   "they"        "thing"      
[861] "think"       "thirteen"    "thirty"      "this"        "thou"       
[866] "though"      "thousand"    "three"       "through"     "throw"      
[871] "thursday"    "tie"         "time"        "to"          "today"      
[876] "together"    "tomorrow"    "tonight"     "too"         "top"        
[881] "total"       "touch"       "toward"      "town"        "trade"      
[886] "traffic"     "train"       "transport"   "travel"      "treat"      
[891] "tree"        "trouble"     "true"        "trust"       "try"        
[896] "tuesday"     "turn"        "twelve"      "twenty"      "two"        
[901] "type"        "under"       "understand"  "union"       "unit"       
[906] "unite"       "university"  "unless"      "until"       "up"         
[911] "upon"        "use"         "usual"       "value"       "various"    
[916] "very"        "video"       "view"        "village"     "visit"      
[921] "vote"        "wage"        "wait"        "walk"        "wall"       
[926] "want"        "war"         "warm"        "wash"        "waste"      
[931] "watch"       "water"       "way"         "we"          "wear"       
[936] "wednesday"   "wee"         "week"        "weigh"       "welcome"    
[941] "well"        "west"        "what"        "when"        "where"      
[946] "whether"     "which"       "while"       "white"       "who"        
[951] "whole"       "why"         "wide"        "wife"        "will"       
[956] "win"         "wind"        "window"      "wish"        "with"       
[961] "within"      "without"     "woman"       "wonder"      "wood"       
[966] "word"        "work"        "world"       "worry"       "worse"      
[971] "worth"       "would"       "write"       "wrong"       "year"       
[976] "yes"         "yesterday"   "yet"         "you"         "young"

Data we will look at:

babynames::babynames (use install.packages("babynames"))

babynames  # n is the total number of people of that sex with that name born in that year

# A tibble: 1,924,665 × 5
    year sex   name          n   prop
   <dbl> <chr> <chr>     <int>  <dbl>
 1  1880 F     Mary       7065 0.0724
 2  1880 F     Anna       2604 0.0267
 3  1880 F     Emma       2003 0.0205
 4  1880 F     Elizabeth  1939 0.0199
 5  1880 F     Minnie     1746 0.0179
 6  1880 F     Margaret   1578 0.0162
 7  1880 F     Ida        1472 0.0151
 8  1880 F     Alice      1414 0.0145
 9  1880 F     Bertha     1320 0.0135
10  1880 F     Sarah      1288 0.0132
# ℹ 1,924,655 more rows

Pattern matching

`str_view()`

str_view(string, pattern = NULL) prints the underlying representation of a string and, when a pattern is supplied, shows how the pattern matches (matches are wrapped in <>).

pattern is interpreted as a regular expression (regex), which can include character classes

str_view(fruit, "berry")

 [6] │ bil<berry>
 [7] │ black<berry>
[10] │ blue<berry>
[11] │ boysen<berry>
[19] │ cloud<berry>
[21] │ cran<berry>
[29] │ elder<berry>
[32] │ goji <berry>
[33] │ goose<berry>
[38] │ huckle<berry>
[50] │ mul<berry>
[70] │ rasp<berry>
[73] │ salal <berry>
[76] │ straw<berry>

`str_detect()`

str_detect(string, pattern) returns TRUE if string contains pattern and FALSE otherwise.

str_detect(fruit, "berry")

 [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
...

Can be used with filter().

E.g., number of babies whose name contains “x”

babynames

# A tibble: 1,924,665 × 5
    year sex   name          n   prop
   <dbl> <chr> <chr>     <int>  <dbl>
 1  1880 F     Mary       7065 0.0724
 2  1880 F     Anna       2604 0.0267
 3  1880 F     Emma       2003 0.0205
...

babynames |> 
  filter(str_detect(name, "x")) |> 
  count(name, wt = n, sort = TRUE)

# A tibble: 974 × 2
   name            n
   <chr>       <int>
 1 Alexander  665492
 2 Alexis     399551
 3 Alex       278705
 4 Alexandra  232223
 5 Max        148787
...

`str_detect()`

You can also use str_detect() in conjunction with group_by(), summarize() etc.

sum() will return number of strings which have pattern
mean() will return proportion of strings which have pattern

E.g. proportion of names per year that have an “x”

babynames %>% 
  group_by(year) %>%
  summarize(prop_x = mean(str_detect(name, "x"))) %>%
  arrange(by = desc(prop_x))

# A tibble: 138 × 2
    year prop_x
   <dbl>  <dbl>
 1  2016 0.0163
 2  2017 0.0159
 3  2015 0.0154
 4  2014 0.0146
 5  2013 0.0145
 6  2012 0.0136
 7  2011 0.0130
 8  2010 0.0126
 9  2009 0.0118
10  2007 0.0108
# ℹ 128 more rows

`str_count()`

str_count(string, pattern = "") counts the number of times pattern is found within each element of string

x <- c("apple", "banana", "pear", "papaya")
str_count(x, "p")

[1] 2 0 1 2

Character classes

Defined by [], let you match from a set of characters (similar idea to %in%)

[abcd] matches any single character that is “a”, “b”, “c”, or “d”
Can invert by using ^: [^abcd] matches any single character except “a”, “b”, “c”, “d” (including non-letters such as spaces)

e.g. any word containing “x” surrounded by vowels, or “y” surrounded by consonants

str_view(words, "[aeiou]x[aeoiu]")

[284] │ <exa>ct
[285] │ <exa>mple
[288] │ <exe>rcise
[289] │ <exi>st

str_view(words, "[^aeiou]y[^aeiou]")

[836] │ <sys>tem
[901] │ <typ>e

Character classes

Defined by [], let you match from a set of characters (similar idea to %in%)

[abcd] matches any single character that is “a”, “b”, “c”, or “d”
Can invert by using ^: [^abcd] matches any single character except “a”, “b”, “c”, “d” (including non-letters such as spaces)

alternation | picks between alternative patterns, e.g. fruits containing “apple”, “melon”, or “nut”, or fruits containing a repeated vowel

str_view(fruit, "apple|nut|melon")

 [1] │ <apple>
[13] │ canary <melon>
[20] │ coco<nut>
[52] │ <nut>
[62] │ pine<apple>
[72] │ rock <melon>
[80] │ water<melon>

str_view(fruit, "aa|ee|ii|oo|uu")

 [9] │ bl<oo>d orange
[33] │ g<oo>seberry
[47] │ lych<ee>
[66] │ purple mangost<ee>n

Counting vowels and consonants in baby names

Can use str_count() with mutate(), e.g. to compute the number of vowels/consonants in baby names:

babynames %>%
  count(name) %>%
  mutate(
    vowels = str_count(name, "[aeiou]"),  # pattern matching is case sensitive, so "A" isn't counted.
    consonants = str_count(name, "[^aeiou]")
  )

# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 Aaban        10      2          3
 2 Aabha         5      2          3
 3 Aabid         2      2          3
 4 Aabir         1      2          3
 5 Aabriella     5      4          5
 6 Aada          1      2          2
 7 Aadam        26      2          3
 8 Aadan        11      2          3
 9 Aadarsh      17      2          5
10 Aaden        18      2          3
# ℹ 97,300 more rows

Counting vowels and consonants in baby names

Pattern matching is case sensitive, so “A” isn’t counted. Two ways around this:

Add the upper case vowels to the character class: str_count(name, "[aeiouAEIOU]")

babynames %>% 
  count(name) %>% 
  mutate(
    vowels = str_count(name, "[aeiouAEIOU]"),
    consonants = str_count(name, "[^aeiouAEIOU]"))

# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 Aaban        10      3          2
 2 Aabha         5      3          2
 3 Aabid         2      3          2
 4 Aabir         1      3          2
 5 Aabriella     5      5          4
 6 Aada          1      3          1
 7 Aadam        26      3          2
 8 Aadan        11      3          2
 9 Aadarsh      17      3          4
10 Aaden        18      3          2
# ℹ 97,300 more rows

Convert the names to lower case: str_count(str_to_lower(name), "[aeiou]")

babynames %>% 
  count(name) %>% 
  mutate(
    name = str_to_lower(name),
    vowels = str_count(name, "[aeiou]"),
    consonants = str_count(name, "[^aeiou]"))

# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 aaban        10      3          2
 2 aabha         5      3          2
 3 aabid         2      3          2
 4 aabir         1      3          2
 5 aabriella     5      5          4
 6 aada          1      3          1
 7 aadam        26      3          2
 8 aadan        11      3          2
 9 aadarsh      17      3          4
10 aaden        18      3          2
# ℹ 97,300 more rows

Replacing and removing values

str_replace(): replaces first match
str_replace_all(): replaces all matches

x <- c("apple", "pear", "banana")
(str_replace(x, "[aeiou]", "-"))

[1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")

[1] "-ppl-"  "p--r"   "b-n-n-"

You can remove patterns by setting the replacement to the empty string "", or by using str_remove() / str_remove_all()

str_remove(x, "[aeiou]")

[1] "pple"  "par"   "bnana"

str_remove_all(x, "[aeiou]")

[1] "ppl" "pr"  "bnn"

Literal characters, metacharacters

More complicated and powerful patterns can be specified using metacharacters

Punctuation characters are typically metacharacters: have special regex meanings (., +, *, etc)
In contrast, letters and numbers which match exactly are literal characters
Due to time constraints, will not cover these. Can read Ch 15 of R4DS2.
Since these metacharacters have extra meanings in regex, need to use escapes to help parse literal instances of these characters
In regex, a \ in front of a metacharacter escapes it, so that it is matched literally
But to create an R string with an actual \ in it, the \ itself must be escaped, so the pattern is written with a double \\:

str_view(c("abc", "a.c", "bef", ".rri"), "\\.")

[2] │ a<.>c
[4] │ <.>rri