06: Pattern matching in strings

STA35B: Statistical Data Science 2

Akira Horiguchi

Data we will look at:

stringr::fruit: 80 fruits

fruit
 [1] "apple"             "apricot"           "avocado"          
 [4] "banana"            "bell pepper"       "bilberry"         
 [7] "blackberry"        "blackcurrant"      "blood orange"     
[10] "blueberry"         "boysenberry"       "breadfruit"       
[13] "canary melon"      "cantaloupe"        "cherimoya"        
[16] "cherry"            "chili pepper"      "clementine"       
[19] "cloudberry"        "coconut"           "cranberry"        
[22] "cucumber"          "currant"           "damson"           
[25] "date"              "dragonfruit"       "durian"           
[28] "eggplant"          "elderberry"        "feijoa"           
[31] "fig"               "goji berry"        "gooseberry"       
[34] "grape"             "grapefruit"        "guava"            
[37] "honeydew"          "huckleberry"       "jackfruit"        
[40] "jambul"            "jujube"            "kiwi fruit"       
[43] "kumquat"           "lemon"             "lime"             
[46] "loquat"            "lychee"            "mandarine"        
[49] "mango"             "mulberry"          "nectarine"        
[52] "nut"               "olive"             "orange"           
[55] "pamelo"            "papaya"            "passionfruit"     
[58] "peach"             "pear"              "persimmon"        
[61] "physalis"          "pineapple"         "plum"             
[64] "pomegranate"       "pomelo"            "purple mangosteen"
[67] "quince"            "raisin"            "rambutan"         
[70] "raspberry"         "redcurrant"        "rock melon"       
[73] "salal berry"       "satsuma"           "star fruit"       
[76] "strawberry"        "tamarillo"         "tangerine"        
[79] "ugli fruit"        "watermelon"       

stringr::words: 980 common English words

words
  [1] "a"           "able"        "about"       "absolute"    "accept"     
  [6] "account"     "achieve"     "across"      "act"         "active"     
 [11] "actual"      "add"         "address"     "admit"       "advertise"  
 [16] "affect"      "afford"      "after"       "afternoon"   "again"      
 [21] "against"     "age"         "agent"       "ago"         "agree"      
 [26] "air"         "all"         "allow"       "almost"      "along"      
 [31] "already"     "alright"     "also"        "although"    "always"     
 [36] "america"     "amount"      "and"         "another"     "answer"     
 [41] "any"         "apart"       "apparent"    "appear"      "apply"      
 [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
 [51] "arm"         "around"      "arrange"     "art"         "as"         
 [56] "ask"         "associate"   "assume"      "at"          "attend"     
 [61] "authority"   "available"   "aware"       "away"        "awful"      
 [66] "baby"        "back"        "bad"         "bag"         "balance"    
 [71] "ball"        "bank"        "bar"         "base"        "basis"      
 [76] "be"          "bear"        "beat"        "beauty"      "because"    
 [81] "become"      "bed"         "before"      "begin"       "behind"     
 [86] "believe"     "benefit"     "best"        "bet"         "between"    
 [91] "big"         "bill"        "birth"       "bit"         "black"      
 [96] "bloke"       "blood"       "blow"        "blue"        "board"      
[101] "boat"        "body"        "book"        "both"        "bother"     
[106] "bottle"      "bottom"      "box"         "boy"         "break"      
[111] "brief"       "brilliant"   "bring"       "britain"     "brother"    
[116] "budget"      "build"       "bus"         "business"    "busy"       
[121] "but"         "buy"         "by"          "cake"        "call"       
[126] "can"         "car"         "card"        "care"        "carry"      
[131] "case"        "cat"         "catch"       "cause"       "cent"       
[136] "centre"      "certain"     "chair"       "chairman"    "chance"     
[141] "change"      "chap"        "character"   "charge"      "cheap"      
[146] "check"       "child"       "choice"      "choose"      "Christ"     
[151] "Christmas"   "church"      "city"        "claim"       "class"      
[156] "clean"       "clear"       "client"      "clock"       "close"      
[161] "closes"      "clothe"      "club"        "coffee"      "cold"       
[166] "colleague"   "collect"     "college"     "colour"      "come"       
[171] "comment"     "commit"      "committee"   "common"      "community"  
[176] "company"     "compare"     "complete"    "compute"     "concern"    
[181] "condition"   "confer"      "consider"    "consult"     "contact"    
[186] "continue"    "contract"    "control"     "converse"    "cook"       
[191] "copy"        "corner"      "correct"     "cost"        "could"      
[196] "council"     "count"       "country"     "county"      "couple"     
[201] "course"      "court"       "cover"       "create"      "cross"      
[206] "cup"         "current"     "cut"         "dad"         "danger"     
[211] "date"        "day"         "dead"        "deal"        "dear"       
[216] "debate"      "decide"      "decision"    "deep"        "definite"   
[221] "degree"      "department"  "depend"      "describe"    "design"     
[226] "detail"      "develop"     "die"         "difference"  "difficult"  
[231] "dinner"      "direct"      "discuss"     "district"    "divide"     
[236] "do"          "doctor"      "document"    "dog"         "door"       
[241] "double"      "doubt"       "down"        "draw"        "dress"      
[246] "drink"       "drive"       "drop"        "dry"         "due"        
[251] "during"      "each"        "early"       "east"        "easy"       
[256] "eat"         "economy"     "educate"     "effect"      "egg"        
[261] "eight"       "either"      "elect"       "electric"    "eleven"     
[266] "else"        "employ"      "encourage"   "end"         "engine"     
[271] "english"     "enjoy"       "enough"      "enter"       "environment"
[276] "equal"       "especial"    "europe"      "even"        "evening"    
[281] "ever"        "every"       "evidence"    "exact"       "example"    
[286] "except"      "excuse"      "exercise"    "exist"       "expect"     
[291] "expense"     "experience"  "explain"     "express"     "extra"      
[296] "eye"         "face"        "fact"        "fair"        "fall"       
[301] "family"      "far"         "farm"        "fast"        "father"     
[306] "favour"      "feed"        "feel"        "few"         "field"      
[311] "fight"       "figure"      "file"        "fill"        "film"       
[316] "final"       "finance"     "find"        "fine"        "finish"     
[321] "fire"        "first"       "fish"        "fit"         "five"       
[326] "flat"        "floor"       "fly"         "follow"      "food"       
[331] "foot"        "for"         "force"       "forget"      "form"       
[336] "fortune"     "forward"     "four"        "france"      "free"       
[341] "friday"      "friend"      "from"        "front"       "full"       
[346] "fun"         "function"    "fund"        "further"     "future"     
[351] "game"        "garden"      "gas"         "general"     "germany"    
[356] "get"         "girl"        "give"        "glass"       "go"         
[361] "god"         "good"        "goodbye"     "govern"      "grand"      
[366] "grant"       "great"       "green"       "ground"      "group"      
[371] "grow"        "guess"       "guy"         "hair"        "half"       
[376] "hall"        "hand"        "hang"        "happen"      "happy"      
[381] "hard"        "hate"        "have"        "he"          "head"       
[386] "health"      "hear"        "heart"       "heat"        "heavy"      
[391] "hell"        "help"        "here"        "high"        "history"    
[396] "hit"         "hold"        "holiday"     "home"        "honest"     
[401] "hope"        "horse"       "hospital"    "hot"         "hour"       
[406] "house"       "how"         "however"     "hullo"       "hundred"    
[411] "husband"     "idea"        "identify"    "if"          "imagine"    
[416] "important"   "improve"     "in"          "include"     "income"     
[421] "increase"    "indeed"      "individual"  "industry"    "inform"     
[426] "inside"      "instead"     "insure"      "interest"    "into"       
[431] "introduce"   "invest"      "involve"     "issue"       "it"         
[436] "item"        "jesus"       "job"         "join"        "judge"      
[441] "jump"        "just"        "keep"        "key"         "kid"        
[446] "kill"        "kind"        "king"        "kitchen"     "knock"      
[451] "know"        "labour"      "lad"         "lady"        "land"       
[456] "language"    "large"       "last"        "late"        "laugh"      
[461] "law"         "lay"         "lead"        "learn"       "leave"      
[466] "left"        "leg"         "less"        "let"         "letter"     
[471] "level"       "lie"         "life"        "light"       "like"       
[476] "likely"      "limit"       "line"        "link"        "list"       
[481] "listen"      "little"      "live"        "load"        "local"      
[486] "lock"        "london"      "long"        "look"        "lord"       
[491] "lose"        "lot"         "love"        "low"         "luck"       
[496] "lunch"       "machine"     "main"        "major"       "make"       
[501] "man"         "manage"      "many"        "mark"        "market"     
[506] "marry"       "match"       "matter"      "may"         "maybe"      
[511] "mean"        "meaning"     "measure"     "meet"        "member"     
[516] "mention"     "middle"      "might"       "mile"        "milk"       
[521] "million"     "mind"        "minister"    "minus"       "minute"     
[526] "miss"        "mister"      "moment"      "monday"      "money"      
[531] "month"       "more"        "morning"     "most"        "mother"     
[536] "motion"      "move"        "mrs"         "much"        "music"      
[541] "must"        "name"        "nation"      "nature"      "near"       
[546] "necessary"   "need"        "never"       "new"         "news"       
[551] "next"        "nice"        "night"       "nine"        "no"         
[556] "non"         "none"        "normal"      "north"       "not"        
[561] "note"        "notice"      "now"         "number"      "obvious"    
[566] "occasion"    "odd"         "of"          "off"         "offer"      
[571] "office"      "often"       "okay"        "old"         "on"         
[576] "once"        "one"         "only"        "open"        "operate"    
[581] "opportunity" "oppose"      "or"          "order"       "organize"   
[586] "original"    "other"       "otherwise"   "ought"       "out"        
[591] "over"        "own"         "pack"        "page"        "paint"      
[596] "pair"        "paper"       "paragraph"   "pardon"      "parent"     
[601] "park"        "part"        "particular"  "party"       "pass"       
[606] "past"        "pay"         "pence"       "pension"     "people"     
[611] "per"         "percent"     "perfect"     "perhaps"     "period"     
[616] "person"      "photograph"  "pick"        "picture"     "piece"      
[621] "place"       "plan"        "play"        "please"      "plus"       
[626] "point"       "police"      "policy"      "politic"     "poor"       
[631] "position"    "positive"    "possible"    "post"        "pound"      
[636] "power"       "practise"    "prepare"     "present"     "press"      
[641] "pressure"    "presume"     "pretty"      "previous"    "price"      
[646] "print"       "private"     "probable"    "problem"     "proceed"    
[651] "process"     "produce"     "product"     "programme"   "project"    
[656] "proper"      "propose"     "protect"     "provide"     "public"     
[661] "pull"        "purpose"     "push"        "put"         "quality"    
[666] "quarter"     "question"    "quick"       "quid"        "quiet"      
[671] "quite"       "radio"       "rail"        "raise"       "range"      
[676] "rate"        "rather"      "read"        "ready"       "real"       
[681] "realise"     "really"      "reason"      "receive"     "recent"     
[686] "reckon"      "recognize"   "recommend"   "record"      "red"        
[691] "reduce"      "refer"       "regard"      "region"      "relation"   
[696] "remember"    "report"      "represent"   "require"     "research"   
[701] "resource"    "respect"     "responsible" "rest"        "result"     
[706] "return"      "rid"         "right"       "ring"        "rise"       
[711] "road"        "role"        "roll"        "room"        "round"      
[716] "rule"        "run"         "safe"        "sale"        "same"       
[721] "saturday"    "save"        "say"         "scheme"      "school"     
[726] "science"     "score"       "scotland"    "seat"        "second"     
[731] "secretary"   "section"     "secure"      "see"         "seem"       
[736] "self"        "sell"        "send"        "sense"       "separate"   
[741] "serious"     "serve"       "service"     "set"         "settle"     
[746] "seven"       "sex"         "shall"       "share"       "she"        
[751] "sheet"       "shoe"        "shoot"       "shop"        "short"      
[756] "should"      "show"        "shut"        "sick"        "side"       
[761] "sign"        "similar"     "simple"      "since"       "sing"       
[766] "single"      "sir"         "sister"      "sit"         "site"       
[771] "situate"     "six"         "size"        "sleep"       "slight"     
[776] "slow"        "small"       "smoke"       "so"          "social"     
[781] "society"     "some"        "son"         "soon"        "sorry"      
[786] "sort"        "sound"       "south"       "space"       "speak"      
[791] "special"     "specific"    "speed"       "spell"       "spend"      
[796] "square"      "staff"       "stage"       "stairs"      "stand"      
[801] "standard"    "start"       "state"       "station"     "stay"       
[806] "step"        "stick"       "still"       "stop"        "story"      
[811] "straight"    "strategy"    "street"      "strike"      "strong"     
[816] "structure"   "student"     "study"       "stuff"       "stupid"     
[821] "subject"     "succeed"     "such"        "sudden"      "suggest"    
[826] "suit"        "summer"      "sun"         "sunday"      "supply"     
[831] "support"     "suppose"     "sure"        "surprise"    "switch"     
[836] "system"      "table"       "take"        "talk"        "tape"       
[841] "tax"         "tea"         "teach"       "team"        "telephone"  
[846] "television"  "tell"        "ten"         "tend"        "term"       
[851] "terrible"    "test"        "than"        "thank"       "the"        
[856] "then"        "there"       "therefore"   "they"        "thing"      
[861] "think"       "thirteen"    "thirty"      "this"        "thou"       
[866] "though"      "thousand"    "three"       "through"     "throw"      
[871] "thursday"    "tie"         "time"        "to"          "today"      
[876] "together"    "tomorrow"    "tonight"     "too"         "top"        
[881] "total"       "touch"       "toward"      "town"        "trade"      
[886] "traffic"     "train"       "transport"   "travel"      "treat"      
[891] "tree"        "trouble"     "true"        "trust"       "try"        
[896] "tuesday"     "turn"        "twelve"      "twenty"      "two"        
[901] "type"        "under"       "understand"  "union"       "unit"       
[906] "unite"       "university"  "unless"      "until"       "up"         
[911] "upon"        "use"         "usual"       "value"       "various"    
[916] "very"        "video"       "view"        "village"     "visit"      
[921] "vote"        "wage"        "wait"        "walk"        "wall"       
[926] "want"        "war"         "warm"        "wash"        "waste"      
[931] "watch"       "water"       "way"         "we"          "wear"       
[936] "wednesday"   "wee"         "week"        "weigh"       "welcome"    
[941] "well"        "west"        "what"        "when"        "where"      
[946] "whether"     "which"       "while"       "white"       "who"        
[951] "whole"       "why"         "wide"        "wife"        "will"       
[956] "win"         "wind"        "window"      "wish"        "with"       
[961] "within"      "without"     "woman"       "wonder"      "wood"       
[966] "word"        "work"        "world"       "worry"       "worse"      
[971] "worth"       "would"       "write"       "wrong"       "year"       
[976] "yes"         "yesterday"   "yet"         "you"         "young"      

Data we will look at:

babynames::babynames (use install.packages("babynames"))

babynames  # n is the total number of people of that sex with that name born in that year
# A tibble: 1,924,665 × 5
    year sex   name          n   prop
   <dbl> <chr> <chr>     <int>  <dbl>
 1  1880 F     Mary       7065 0.0724
 2  1880 F     Anna       2604 0.0267
 3  1880 F     Emma       2003 0.0205
 4  1880 F     Elizabeth  1939 0.0199
 5  1880 F     Minnie     1746 0.0179
 6  1880 F     Margaret   1578 0.0162
 7  1880 F     Ida        1472 0.0151
 8  1880 F     Alice      1414 0.0145
 9  1880 F     Bertha     1320 0.0135
10  1880 F     Sarah      1288 0.0132
# ℹ 1,924,655 more rows

Pattern matching

str_view()

str_view(string, pattern = NULL) prints the underlying representation of a string and shows how a pattern matches.

  • pattern will parse regular expressions (regex) and character classes
str_view(fruit, "berry")
 [6] │ bil<berry>
 [7] │ black<berry>
[10] │ blue<berry>
[11] │ boysen<berry>
[19] │ cloud<berry>
[21] │ cran<berry>
[29] │ elder<berry>
[32] │ goji <berry>
[33] │ goose<berry>
[38] │ huckle<berry>
[50] │ mul<berry>
[70] │ rasp<berry>
[73] │ salal <berry>
[76] │ straw<berry>

str_detect()

str_detect(string, pattern) returns TRUE if string contains pattern and FALSE otherwise.

str_detect(fruit, "berry")
 [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
...

Can be used with filter().

E.g., number of babies whose name contains “x”

babynames
# A tibble: 1,924,665 × 5
    year sex   name          n   prop
   <dbl> <chr> <chr>     <int>  <dbl>
 1  1880 F     Mary       7065 0.0724
 2  1880 F     Anna       2604 0.0267
 3  1880 F     Emma       2003 0.0205
...
babynames |> 
  filter(str_detect(name, "x")) |> 
  count(name, wt = n, sort = TRUE)
# A tibble: 974 × 2
   name            n
   <chr>       <int>
 1 Alexander  665492
 2 Alexis     399551
 3 Alex       278705
 4 Alexandra  232223
 5 Max        148787
...

str_detect()

You can also use str_detect() in conjunction with group_by(), summarize() etc.

  • sum() will return number of strings which have pattern
  • mean() will return proportion of strings which have pattern

E.g. proportion of names per year that have an “x”

babynames %>% 
  group_by(year) %>%
  summarize(prop_x = mean(str_detect(name, "x"))) %>%
  arrange(by = desc(prop_x))
# A tibble: 138 × 2
    year prop_x
   <dbl>  <dbl>
 1  2016 0.0163
 2  2017 0.0159
 3  2015 0.0154
 4  2014 0.0146
 5  2013 0.0145
 6  2012 0.0136
 7  2011 0.0130
 8  2010 0.0126
 9  2009 0.0118
10  2007 0.0108
# ℹ 128 more rows

str_count()

str_count(string, pattern = "") counts the number of times pattern is found within each element of string

x <- c("apple", "banana", "pear", "papaya")
str_count(x, "p")
[1] 2 0 1 2

Character classes

Defined by [], let you match from a set of characters (similar idea to %in%)

  • [abcd] matches any single character that is “a”, “b”, “c”, or “d”
  • Can invert by using ^: [^abcd] matches any single character except “a”, “b”, “c”, “d”

e.g. any word containing “x” surrounded by vowels, or “y” surrounded by consonants

str_view(words, "[aeiou]x[aeoiu]")
[284] │ <exa>ct
[285] │ <exa>mple
[288] │ <exe>rcise
[289] │ <exi>st
str_view(words, "[^aeiou]y[^aeiou]")
[836] │ <sys>tem
[901] │ <typ>e

Character classes

Defined by [], let you match from a set of characters (similar idea to %in%)

  • [abcd] matches any single character that is “a”, “b”, “c”, or “d”
  • Can invert by using ^: [^abcd] matches any single character except “a”, “b”, “c”, “d”

alternation | picks between alternative patterns, e.g. fruits containing “apple”, “melon”, or “nut”, or fruits containing a repeated vowel

str_view(fruit, "apple|nut|melon")
 [1] │ <apple>
[13] │ canary <melon>
[20] │ coco<nut>
[52] │ <nut>
[62] │ pine<apple>
[72] │ rock <melon>
[80] │ water<melon>
str_view(fruit, "aa|ee|ii|oo|uu")
 [9] │ bl<oo>d orange
[33] │ g<oo>seberry
[47] │ lych<ee>
[66] │ purple mangost<ee>n

Counting vowels and consonants in baby names

Can use str_count() with mutate, i.e. computing number of vowels/consonants in baby names:

babynames %>%
  count(name) %>%
  mutate(
    vowels = str_count(name, "[aeiou]"),  # pattern matching is case sensitive, so "A" isn't counted.
    consonants = str_count(name, "[^aeiou]")
  )
# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 Aaban        10      2          3
 2 Aabha         5      2          3
 3 Aabid         2      2          3
 4 Aabir         1      2          3
 5 Aabriella     5      4          5
 6 Aada          1      2          2
 7 Aadam        26      2          3
 8 Aadan        11      2          3
 9 Aadarsh      17      2          5
10 Aaden        18      2          3
# ℹ 97,300 more rows

Counting vowels and consonants in baby names

Pattern matching is case sensitive, so “A” isn’t counted. Two ways around this:

  1. Add the upper case vowels to the character class: str_count(name, "[aeiouAEIOU]")
babynames %>% 
  count(name) %>% 
  mutate(
    vowels = str_count(name, "[aeiouAEIOU]"),
    consonants = str_count(name, "[^aeiouAEIOU]"))
# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 Aaban        10      3          2
 2 Aabha         5      3          2
 3 Aabid         2      3          2
 4 Aabir         1      3          2
 5 Aabriella     5      5          4
 6 Aada          1      3          1
 7 Aadam        26      3          2
 8 Aadan        11      3          2
 9 Aadarsh      17      3          4
10 Aaden        18      3          2
# ℹ 97,300 more rows
  2. Convert the names to lower case: str_count(str_to_lower(name), "[aeiou]")
babynames %>% 
  count(name) %>% 
  mutate(
    name = str_to_lower(name),
    vowels = str_count(name, "[aeiou]"),
    consonants = str_count(name, "[^aeiou]"))
# A tibble: 97,310 × 4
   name          n vowels consonants
   <chr>     <int>  <int>      <int>
 1 aaban        10      3          2
 2 aabha         5      3          2
 3 aabid         2      3          2
 4 aabir         1      3          2
 5 aabriella     5      5          4
 6 aada          1      3          1
 7 aadam        26      3          2
 8 aadan        11      3          2
 9 aadarsh      17      3          4
10 aaden        18      3          2
# ℹ 97,300 more rows

Replacing and removing values

  • str_replace(): replaces first match
  • str_replace_all(): replaces all matches
x <- c("apple", "pear", "banana")
(str_replace(x, "[aeiou]", "-"))
[1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")
[1] "-ppl-"  "p--r"   "b-n-n-"
  • You can remove matches by setting the replacement to "", or by using str_remove() / str_remove_all()
str_remove(x, "[aeiou]")
[1] "pple"  "par"   "bnana"
str_remove_all(x, "[aeiou]")
[1] "ppl" "pr"  "bnn"

Literal characters, metacharacters

More complicated and powerful patterns can be specified using metacharacters

  • Punctuation characters are typically metacharacters: have special regex meanings (., +, *, etc)
  • In contrast, letters and numbers which match exactly are literal characters
  • Due to time constraints, will not cover these. Can read Ch 15 of R4DS2.
  • Since these metacharacters have extra meanings in regex, we need to escape them in order to match literal instances of these characters
  • In regex, we put a \ in front of a metacharacter to escape it
  • But to create an R string with an actual \ in it, we must also escape the backslash itself, so we need to write a double backslash \\:
str_view(c("abc", "a.c", "bef", ".rri"), "\\.")
[2] │ a<.>c
[4] │ <.>rri