I am a VBA user who started studying machine learning. As a memorandum, I would like to summarize the Python / R grammar while comparing it with VBA.
table of contents
In the previous article, I compared Python and R string operations with VBA. In this continuation, I try out stringr, R's string manipulation package.
The reference article states that "string processing is possible with the functions provided by R's standard base package, but stringr is easier to use thanks to its rational behavior and unified interface." Having actually used it, I find it is indeed impressive and easy to use, because the function naming and the order of the arguments are consistent.
Reference: stringr — Process R strings in a decent way
R(stringr)
R
library(stringr)
# Concatenate several strings into one with str_c().
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3)
# "abcdefghij"
R(stringr)
R
# Count the characters in a string with str_length().
s <- "abcdefghij"
str_length(s)
# 10
R(stringr)
R
# Extract substrings with str_sub(string, start, end); both positions are inclusive.
s <- "abcdefghij"
# First two characters.
str_sub(s, 1, 2)
# "ab"
# Negative positions count from the end of the string: last two characters.
str_sub(s, -2, -1)
# "ij"
# Characters 4 through 6.
str_sub(s, 4, 6)
# "def"
R(stringr)
R
# Search for a substring: str_detect() tests for presence, str_count() counts
# matches, and str_locate() / str_locate_all() report match positions.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
str_count(s, "def")
# 1
str_count(t, "def")
# 2
str_locate(s, "def")
#      start end
# [1,]     4   6
# str_locate() reports only the first match, even though t contains "def" twice.
str_locate(t, "def")
#      start end
# [1,]     4   6
class(str_locate(t, "def"))
# "matrix" "array"   (R >= 4.0.0; older versions of R print only "matrix")
# str_locate_all() returns a list holding one matrix with a row per match.
str_locate_all(t, "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, "def"))
# "list"
R(stringr)
R
# Replace a substring: str_replace() changes only the first match,
# str_replace_all() changes every match.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_replace(s, "def", "DEF")
# "abcDEFghij"
# Only the first "def" in t is replaced.
str_replace(t, "def", "DEF")
# "abcDEFghijabcdefghij"
str_replace_all(t, "def", "DEF")
# "abcDEFghijabcDEFghij"
R(stringr)
R
# Case conversion. s deliberately mixes lower- and upper-case letters.
s <- "abcDEFghij"
str_to_upper(s) # To uppercase
# "ABCDEFGHIJ"
str_to_lower(s) # To lowercase
# "abcdefghij"
str_to_title(s) # Capitalize the first letter of every word, lowercase the rest
# "Abcdefghij"
str_to_sentence(s) # Capitalize only the first letter of the string, lowercase the rest
# "Abcdefghij"
# With several words the difference between the two functions becomes visible.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap upper and lower case one character at a time.
t <- ""
# seq_len() gives an empty sequence for an empty string; 1:0 would iterate over c(1, 0).
for (i in seq_len(str_length(s))) {
  stemp <- str_sub(s, i, i)
  if (stemp == str_to_lower(stemp)) {
    # The character is already lower case (or has no case): upper-case it.
    stemp <- str_to_upper(stemp)
  } else if (stemp == str_to_upper(stemp)) {
    stemp <- str_to_lower(stemp)
  }
  t <- str_c(t, stemp)
}
t # Upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # Is the string entirely upper case?
# FALSE
s == str_to_lower(s) # Is the string entirely lower case?
# FALSE
R(stringr)
R
R(stringr)
R
# Reverse a string by appending its characters from last to first.
# Negative positions in str_sub() count from the end, so -i is the i-th
# character from the right.
s <- "abcdefghij"
t <- ""
# seq_len() gives an empty sequence for an empty string; 1:0 would iterate over c(1, 0).
for (i in seq_len(str_length(s))) {
  t <- str_c(t, str_sub(s, -i, -i))
}
t
# "jihgfedcba"
R(stringr)
R
# Repeat a string a given number of times with str_dup().
str_dup("A", 3)
# "AAA"
str_dup("def", 3)
# "defdefdef"
R(stringr)
R
# Build runs of spaces with str_dup(); the "-" markers make the spaces visible.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# s gets 2 spaces before "d", 3 before "e", 4 before "f" and 5 trailing spaces.
s <- str_c(str_dup(" ", 2), "d",
str_dup(" ", 3), "e",
str_dup(" ", 4), "f",
str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
R(stringr)
R
# Strip whitespace from the left end, the right end, or both ends of s
# ("  d   e    f     "); the spaces inside the string are kept.
str_trim(s, side="left")
# "d   e    f     "
str_trim(s, side="right")
# "  d   e    f"
str_trim(s, side="both")
# "d   e    f"
The functions in the stringr package can be applied not only to a single string, but also to string vectors and data frames.
For example, if you apply the str_length function to a string vector consisting of 3 strings, str_length is applied to each element and a vector of 3 numbers is returned.
R(stringr)
R
# The same stringr functions applied to a character vector: each call below
# works element by element and returns one result per input string.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# Concatenation: the single string "_1" is appended to every element.
str_c(ss, "_1")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
# Length
str_length(ss)
# [1] 10 10 10
# Substring extraction
str_sub(ss, 1, 2)
# [1] "ab" "cd" "ef"
str_sub(ss, -2, -1)
# [1] "ij" "kl" "mn"
str_sub(ss, 2, 3)
# [1] "bc" "de" "fg"
# Search: the third element does not contain "def".
str_detect(ss, "def")
# [1] TRUE TRUE FALSE
str_count(ss, "def")
# [1] 1 1 0
# str_locate() returns one row per element; NA marks an element without a match.
str_locate(ss, "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# str_locate_all() returns one matrix per element; it has no rows when nothing matches.
str_locate_all(ss, "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#
# Replacement
str_replace(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
# Case conversion
str_to_upper(ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise checks for all-uppercase / all-lowercase strings.
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
# Repetition
str_dup(ss, 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Trimming: tt pads every element with spaces on both ends.
tt <- str_c(" ", ss, " _1 ")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
# With no side argument both ends are trimmed.
str_trim(tt)
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side="left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side="right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"
I would like to summarize vectors and data frames in another article.
The string manipulation functions used in each language are listed below. For comparison, the Excel formulas are also shown. The variables are assumed to be s1 = "abc", s2 = "def", s3 = "ghij", s = "abcdefghij", t = "abcdefghijabcdefghij", u = "abcDEFghij", v = "ａｂｃＤＥＦｇｈｉｊ" (full-width) and w = "  d   e    f     ". Likewise, the Excel cells are assumed to contain A1: ="abc", A2: ="def", A3: ="ghij", A4: ="abcdefghij", A5: ="abcdefghijabcdefghij", A6: ="abcDEFghij", A7: ="ａｂｃＤＥＦｇｈｉｊ" (full-width) and A8: ="  d   e    f     ".
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
Join | s1 + s2 + s3 | paste0(s1, s2, s3) paste(s1, s2, s3, sep="") |
str_c(s1, s2, s3) | s1 & s2 & s3 | =A1&A2&A3 =CONCATENATE( A1,A2,A3) |
abcdefghij |
length | len(s) | nchar(s) | str_length(s) | Len(s) | =LEN(A4) | 10 |
Inversion | s[::-1] | StrReverse(s) | jihgfedcba | |||
repetition | 'A' * 3 | str_dup("A", 3) | String(3, "A") | =REPT("A",3) | AAA | |
repetition | 'def' * 3 | str_dup("def", 3) | =REPT("def",3) | defdefdef |
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
From the left | s[0:2] s[:2] |
substr(s, 1, 2) substring(s, 1, 2) |
str_sub(s, 1, 2) | Left(s, 2) | =LEFT(A4,2) | ab |
From the right | s[len(s)-2:len(s)] s[-2:] |
substr(s, nchar(s)-2+1, nchar(s)) |
str_sub(s, -2, -1) | Right(s, 2) | =RIGHT(A4,2) | ij |
On the way | s[3:6] | substr(s, 4, 6) | str_sub(s, 4, 6) | Mid(s, 4, 3) | =MID(A4,4,3) | def |
Note) Regarding the extraction of the character string "in the middle", the Python and R functions specify "where to where", but the VBA and EXCEL functions specify "where and how many characters".
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
Search | s.find('def') | str_locate(s, "def") | InStr(1, s, "def") | =FIND("def",A4,1) =SEARCH("def",A4,1) |
3,4 | |
Search from behind | t.rfind('def') | InStrRev(t, "def") | 13,14 | |||
count | t.count('def') | str_count(t, "def") | 2 |
Note) See above for the str_detect and str_locate functions.
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
Replacement | s.replace('def', 'DEF') | sub("def", "DEF", s) | str_replace(s, "def", "DEF") | Replace(s, "def", "DEF") | =SUBSTITUTE( A4,"def","DEF") =REPLACE(A4, FIND("def",A4), LEN("def"),"DEF") |
abcDEFghij |
Replace only the first one | sub("def", "DEF", t) | str_replace(t, "def", "DEF") | abcDEFghij abcdefghij |
|||
Replace all | t.replace('def', 'DEF') | gsub("def", "DEF", t) | str_replace_all(t, "def", "DEF") | Replace(t, "def", "DEF") | =SUBSTITUTE( A5,"def","DEF") |
abcDEFghij abcDEFghij |
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
Uppercase | u.upper() | toupper(u) | str_to_upper(u) | UCase(u) | =UPPER(A6) | ABCDEFGHIJ |
To lowercase | u.lower() | tolower(u) | str_to_lower(u) | LCase(u) | =LOWER(A6) | abcdefghij |
Uppercase only at the beginning, lowercase otherwise | u.capitalize() | str_to_title(u) str_to_sentence(u) |
StrConv(u, vbProperCase) | =PROPER(A6) | Abcdefghij | |
Swap uppercase and lowercase | u.swapcase() | chartr("A-Za-z", "a-zA-Z", u) | ABCdefGHIJ | |||
Judgment of capital letters | u.isupper() | u == toupper(u) | u == str_to_upper(u) | False | ||
Judgment of lowercase letters | u.islower() | u == tolower(u) | u == str_to_lower(u) | False | ||
Full-width | chartr("A-Za-z", "Ａ-Ｚａ-ｚ", u) | StrConv(u, vbWide) | =JIS(A6) | ａｂｃＤＥＦｇｈｉｊ | ||
Half-width | chartr("Ａ-Ｚａ-ｚ", "A-Za-z", v) | StrConv(v, vbNarrow) | =ASC(A7) | abcDEFghij |
Python | R | R(stringr) | VBA | EXCEL | result | |
---|---|---|---|---|---|---|
space | ' ' * 3 | str_dup(" ", 3) | Space(3) | =REPT(" ",3) | "   " | |
Remove spaces on both sides | w.strip(' ') | str_trim(w, side="both") | Trim(w) | =TRIM(A8) | "d   e    f" | |
Delete left space | w.lstrip(' ') | str_trim(w, side="left") | LTrim(w) | "d   e    f     " | ||
Delete right space | w.rstrip(' ') | str_trim(w, side="right") | RTrim(w) | "  d   e    f" |
Note) Excel's TRIM function also collapses each run of spaces inside the string down to a single space, so its result is "d e f".
The entire program used for reference is shown below. See the previous article for the Python and VBA code.
R(stringr)
R
library(stringr)
# String concatenation with str_c()
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3)
# "abcdefghij"
# String length with str_length()
s <- "abcdefghij"
str_length(s)
# 10
# Substring extraction with str_sub(string, start, end); both positions are
# inclusive and negative positions count from the end of the string.
s <- "abcdefghij"
str_sub(s, 1, 2)
# "ab"
str_sub(s, -2, -1)
# "ij"
str_sub(s, 4, 6)
# "def"
# Searching for a substring: str_detect() tests for presence, str_count()
# counts matches, str_locate() / str_locate_all() report match positions.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
str_count(s, "def")
# 1
str_count(t, "def")
# 2
str_locate(s, "def")
#      start end
# [1,]     4   6
# str_locate() reports only the first match, even though t contains "def" twice.
str_locate(t, "def")
#      start end
# [1,]     4   6
class(str_locate(t, "def"))
# "matrix" "array"   (R >= 4.0.0; older versions of R print only "matrix")
# str_locate_all() returns a list holding one matrix with a row per match.
str_locate_all(t, "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, "def"))
# "list"
# String replacement: str_replace() changes only the first match,
# str_replace_all() changes every match.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_replace(s, "def", "DEF")
# "abcDEFghij"
# Only the first "def" in t is replaced.
str_replace(t, "def", "DEF")
# "abcDEFghijabcdefghij"
str_replace_all(t, "def", "DEF")
# "abcDEFghijabcDEFghij"
# Converting the case of a string. s deliberately mixes lower- and upper-case letters.
s <- "abcDEFghij"
str_to_upper(s) # To uppercase
# "ABCDEFGHIJ"
str_to_lower(s) # To lowercase
# "abcdefghij"
str_to_title(s) # Capitalize the first letter of every word, lowercase the rest
# "Abcdefghij"
str_to_sentence(s) # Capitalize only the first letter of the string, lowercase the rest
# "Abcdefghij"
# With several words the difference between the two functions becomes visible.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap upper and lower case one character at a time.
t <- ""
# seq_len() gives an empty sequence for an empty string; 1:0 would iterate over c(1, 0).
for (i in seq_len(str_length(s))) {
  stemp <- str_sub(s, i, i)
  if (stemp == str_to_lower(stemp)) {
    # The character is already lower case (or has no case): upper-case it.
    stemp <- str_to_upper(stemp)
  } else if (stemp == str_to_upper(stemp)) {
    stemp <- str_to_lower(stemp)
  }
  t <- str_c(t, stemp)
}
t # Upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # Is the string entirely upper case?
# FALSE
s == str_to_lower(s) # Is the string entirely lower case?
# FALSE
# Reversing a string by appending its characters from last to first.
# Negative positions in str_sub() count from the end, so -i is the i-th
# character from the right.
s <- "abcdefghij"
t <- ""
# seq_len() gives an empty sequence for an empty string; 1:0 would iterate over c(1, 0).
for (i in seq_len(str_length(s))) {
  t <- str_c(t, str_sub(s, -i, -i))
}
t
# "jihgfedcba"
# Repeating a string a given number of times with str_dup()
str_dup("A", 3)
# "AAA"
str_dup("def", 3)
# "defdefdef"
# Runs of spaces built with str_dup(); the "-" markers make the spaces visible.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# s gets 2 spaces before "d", 3 before "e", 4 before "f" and 5 trailing spaces.
s <- str_c(str_dup(" ", 2), "d",
str_dup(" ", 3), "e",
str_dup(" ", 4), "f",
str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
# Remove spaces before and/or after the string; spaces inside it are kept.
str_trim(s, side="left")
# "d   e    f     "
str_trim(s, side="right")
# "  d   e    f"
str_trim(s, side="both")
# "d   e    f"
# String vectors: each stringr call below works element by element and
# returns one result per input string.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# Concatenation: the single string "_1" is appended to every element.
str_c(ss, "_1")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
# Length
str_length(ss)
# [1] 10 10 10
# Substring extraction
str_sub(ss, 1, 2)
# [1] "ab" "cd" "ef"
str_sub(ss, -2, -1)
# [1] "ij" "kl" "mn"
str_sub(ss, 2, 3)
# [1] "bc" "de" "fg"
# Search: the third element does not contain "def".
str_detect(ss, "def")
# [1] TRUE TRUE FALSE
str_count(ss, "def")
# [1] 1 1 0
# str_locate() returns one row per element; NA marks an element without a match.
str_locate(ss, "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# str_locate_all() returns one matrix per element; it has no rows when nothing matches.
str_locate_all(ss, "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#
# Replacement
str_replace(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
# Case conversion
str_to_upper(ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise checks for all-uppercase / all-lowercase strings.
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
# Repetition
str_dup(ss, 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Trimming: tt pads every element with spaces on both ends.
tt <- str_c(" ", ss, " _1 ")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
# With no side argument both ends are trimmed.
str_trim(tt)
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side="left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side="right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"
Recommended Posts