# Report the line, word and byte counts of the input file. These are the
# three columns wc prints by default; they are simply requested explicitly.
wc -l -w -c popular-names.txt
# Sample output:
#   2780 11120 55026 popular-names.txt
# Report the line count only.
wc -l popular-names.txt
# Sample output:
#   2780 popular-names.txt
# Replace every tab in popular-names.txt with a single space, shown four
# different ways. Each method rewrites popular-names-space.txt from scratch:
# `>` overwrites ("clobbers") a file, whereas `>>` would append to it.
#
# Portability background for sed, from "Differences between sed on Mac OSX
# and other 'standard' sed?":
#   https://unix.stackexchange.com/questions/13711/differences-between-sed-on-mac-osx-and-other-standard-sed
#   - GNU sed interprets escape sequences like \t, \n, \001, \x01, \w, and \b.
#   - OS X's sed and POSIX sed only interpret \n (but not in the replacement
#     part of s).
# One workaround is to install GNU sed on macOS:
#   https://medium.com/@bramblexu/install-gnu-sed-on-mac-os-and-set-it-as-default-7c17ef1b8f64
# The commands below sidestep the issue instead by never passing `\t` to sed.

# sed method 1: the shell's $'...' quoting inserts a literal tab character
# into the sed script, so the expression works with both GNU and BSD sed.
sed $'s/\t/ /g' popular-names.txt > popular-names-space.txt
# sed method 2: POSIX character class. Note that [[:space:]] matches any
# whitespace character, not only tabs.
sed -E 's/[[:space:]]/ /g' popular-names.txt > popular-names-space.txt
# tr method: plain one-for-one translation. -s is deliberately not used: it
# would squeeze runs of spaces into one, unlike the other three methods.
tr '\t' ' ' < popular-names.txt > popular-names-space.txt
# expand method: with tab stops one column apart, each tab becomes one space.
expand -t 1 popular-names.txt > popular-names-space.txt
# Extract columns 1 and 2 of the tab-separated original into col1.txt and
# col2.txt. Tab is cut's default field delimiter, so no -d is needed.
cut -f 1 popular-names.txt > col1.txt
cut -f 2 popular-names.txt > col2.txt
# Do the same from the space-separated copy, this time naming the delimiter.
# These two commands overwrite the col1.txt / col2.txt written just above.
cut -f 1 -d ' ' popular-names-space.txt > col1.txt
cut -f 2 -d ' ' popular-names-space.txt > col2.txt
# Join col1.txt and col2.txt line by line into merge_test.txt, separated by
# a tab (paste's default delimiter, stated explicitly here).
paste -d '\t' col1.txt col2.txt > merge_test.txt
# Print the first ten lines of the merged file ...
head -n 10 merge_test.txt
# ... and then its last ten lines.
tail -n 10 merge_test.txt
# Split popular-names.txt into consecutive pieces of at most 500 lines each.
# This version targets the split shipped with macOS, which according to the
# original note does not support the -d and -n options; GNU split does.
#   -a 1    suffix length: one-letter suffixes. Numeric names such as
#           split01.txt are not possible this way; those need GNU split.
#   -l 500  line count: every output file holds at most 500 lines.
#   split-  prefix used for the output file names.
split -a 1 -l 500 popular-names.txt split-
# Resulting files:
#   split-a
#   split-b
#   split-c
#   ...
# Write the distinct values of column 1, sorted, to unique_names.txt.
# cut splits on tabs by default; sort -u sorts and drops duplicates in one go.
cut -f 1 popular-names.txt | sort -u > unique_names.txt
# Sort the rows of popular-names.txt by their third column, largest value
# first, and write the result to popular-names-sorted.txt.
#   -t $'\t'  fields are separated by tabs only, so a space inside a field
#             (e.g. a two-word name) cannot shift the column positions.
#   -k 3,3    the sort key is exactly the 3rd column; a bare "-k 3" would
#             extend the key from column 3 to the end of the line.
#   -n        compare the key numerically.
#   -r        reverse the order (descending).
sort -t $'\t' -k 3,3 -n -r popular-names.txt > popular-names-sorted.txt
# Count how often each value of column 1 occurs and list the values from
# most to least frequent in name_frequency.txt.
#
# Stage by stage:
#   cut -f 1      keep only the first tab-separated column
#   sort          group identical names on adjacent lines
#   uniq -c       collapse each group to one line prefixed with its count,
#                 e.g.
#                     17 Abigail
#                      3 Aiden
#                      8 Alexander
#                      8 Alexis
#   sort -k 1nr   order by the leading count: key starts at column 1,
#                 n = numeric comparison, r = descending order
cut -f 1 popular-names.txt \
  | sort \
  | uniq -c \
  | sort -k 1nr \
  > name_frequency.txt
# Recommended Posts