As a memorandum, I wanted to summarize the preg_xxx functions
and the regular expressions that I use relatively often when developing with PHP, so I will describe them here.
In practice each of them gets adjusted as needed, but they serve as a reasonable starting point.
It also describes whether similar results can be obtained with Python3.
This still needs to be scrutinized, but it serves as an example of a regular expression. It is adjusted each time depending on the input and the use case.
/\A(\d{4})*[-\/]*(\d{1,2})[-\/]*(\d{1,2}) *((\d{1,2}):(\d{1,2})(:(\d{1,2}))*)*\Z/
Notation | meaning |
---|---|
/ | delimiter |
\A | The beginning of the string |
() | Treat as a set |
\d{4} | 4-digit number |
* | Repeat 0 or more times of the previous pattern |
[] | Character set; matches any one of the characters inside [] |
\d{1,2} | Numerical value of 1 to 2 digits |
\Z | The end of the string |
<?php
/**
 * Split a loosely formatted date/time string into its numeric components.
 *
 * The pattern accepts an optional 4-digit year, a month and a day
 * (optionally separated by "-" or "/"), optionally followed by a time
 * written as "H:i" or "H:i:s".
 *
 * @param string $date Date/time text such as "2017-07-03 13:15:03".
 * @return array Keys Y, m, d, H, i, s holding integers, where -1 marks a
 *               component that was not present; an empty array when $date
 *               does not match the pattern at all.
 */
function pick_date(string $date) : array {
    $pattern = '/\A(\d{4})*[-\/]*(\d{1,2})[-\/]*(\d{1,2}) *((\d{1,2}):(\d{1,2})(:(\d{1,2}))*)*\Z/';
    if (!preg_match($pattern, $date, $matches)) {
        return [];
    }
    // preg_match() drops trailing unmatched groups but reports unmatched
    // groups in the middle (e.g. a missing year) as '', so isset() alone
    // would turn a missing component into 0 instead of -1.
    $pick = function (int $group) use ($matches) : int {
        return (isset($matches[$group]) && $matches[$group] !== '')
            ? intval($matches[$group])
            : -1;
    };
    // Groups 4 and 7 are the wrapping "whole time" and ":seconds" groups.
    return [
        'Y' => $pick(1),
        'm' => $pick(2),
        'd' => $pick(3),
        'H' => $pick(5),
        'i' => $pick(6),
        's' => $pick(8)
    ];
}
print_r(pick_date('2017-07-03 13:15:03'));
print_r(pick_date('2017-07-3 13:01'));
print_r(pick_date('2017/07/03 13'));
print_r(pick_date('2017/07-3 13:1:3'));
print_r(pick_date('201773 13:00'));
Execution result
Array
(
[Y] => 2017
[m] => 7
[d] => 3
[H] => 13
[i] => 15
[s] => 3
)
Array
(
[Y] => 2017
[m] => 7
[d] => 3
[H] => 13
[i] => 1
[s] => -1
)
Array
(
)
Array
(
[Y] => 2017
[m] => 7
[d] => 3
[H] => 13
[i] => 1
[s] => 3
)
Array
(
[Y] => 2017
[m] => 7
[d] => 3
[H] => 13
[i] => 0
[s] => -1
)
Unlike PHP, no delimiter is required.
Also, if you do not care about the key order, you can use `dict`
instead of `OrderedDict`. With `findall` you can get the same kind of result as in PHP.
import re
from collections import OrderedDict
def pick_date(date):
    """Split a loosely formatted date/time string into its components.

    The pattern accepts an optional 4-digit year, a month and a day
    (optionally separated by "-" or "/"), optionally followed by a time
    written as "H:i" or "H:i:s".

    Returns an OrderedDict with the keys Y, m, d, H, i, s mapped to the
    captured text (strings; '' for a component that was not present), or
    an empty OrderedDict when ``date`` does not match the pattern.
    """
    pattern = r'\A(\d{4})*[-\/]*(\d{1,2})[-\/]*(\d{1,2}) *((\d{1,2}):(\d{1,2})(:(\d{1,2}))*)*\Z'
    found = re.findall(pattern, date)
    if not found:
        return OrderedDict()
    # findall() yields one tuple of all eight groups; indexes 3 and 6 are
    # the wrapping "whole time" and ":seconds" groups and are skipped.
    groups = found[0]
    return OrderedDict((
        ('Y', groups[0]),
        ('m', groups[1]),
        ('d', groups[2]),
        ('H', groups[4]),
        ('i', groups[5]),
        ('s', groups[7]),
    ))


print(pick_date('2017-07-03 13:15:03'))
print(pick_date('2017-07-3 13:01'))
print(pick_date('2017/07/03 13'))
print(pick_date('2017/07-3 13:1:3'))
print(pick_date('201773 13:00'))
Output result
OrderedDict([('Y', '2017'), ('m', '07'), ('d', '03'), ('H', '13'), ('i', '15'), ('s', '03')])
OrderedDict([('Y', '2017'), ('m', '07'), ('d', '3'), ('H', '13'), ('i', '01'), ('s', '')])
OrderedDict()
OrderedDict([('Y', '2017'), ('m', '07'), ('d', '3'), ('H', '13'), ('i', '1'), ('s', '3')])
OrderedDict([('Y', '2017'), ('m', '7'), ('d', '3'), ('H', '13'), ('i', '00'), ('s', '')])
In this case, depending on the scenario, the following method, which does not use regular expressions, may be better. It is probably the one to choose if you want stricter parsing.
<?php
date_default_timezone_set('Asia/Tokyo');

/**
 * Parse a date/time string with strtotime() and return its formatted parts.
 *
 * The result of strtotime() is handed to DateTime::setTimestamp() without
 * any check, so this function does not signal unparseable input itself.
 *
 * @param string $date Date/time text understood by strtotime().
 * @return array Keys y, m, d, H, i, s holding zero-padded strings.
 */
function pick_date(string $date) : array {
    $dt = new DateTime();
    $dt->setTimestamp(strtotime($date));
    // Result key => DateTime::format() character.
    $formats = ['y' => 'Y', 'm' => 'm', 'd' => 'd', 'H' => 'H', 'i' => 'i', 's' => 's'];
    $parts = [];
    foreach ($formats as $key => $format) {
        $parts[$key] = $dt->format($format);
    }
    return $parts;
}
print_r(pick_date('2017-07-03 13:15:03'));
print_r(pick_date('2017-07-3 13:01'));
print_r(pick_date('2017/07/03 13'));
print_r(pick_date('2017/07-3 13:1:3'));
print_r(pick_date('201773 13:00'));
Output result
Array
(
[y] => 2017
[m] => 07
[d] => 03
[H] => 13
[i] => 15
[s] => 03
)
Array
(
[y] => 2017
[m] => 07
[d] => 03
[H] => 13
[i] => 01
[s] => 00
)
Array
(
[y] => 1970
[m] => 01
[d] => 01
[H] => 09
[i] => 00
[s] => 00
)
Array
(
[y] => 1970
[m] => 01
[d] => 01
[H] => 09
[i] => 00
[s] => 00
)
Array
(
[y] => 1970
[m] => 01
[d] => 01
[H] => 09
[i] => 00
[s] => 00
)
In Python, I considered converting the string once with `strptime` from `datetime`,
but unlike PHP's `strtotime` it requires an explicit `format`,
so I cannot do something equivalent.
Validate whether it conforms to the email format. See here.
/\A^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,})$\Z/
Notation | meaning |
---|---|
/ | delimiter |
\A | The beginning of the string |
^[] | Starts with one of the characters in the [] character set |
a-z | Letters from a to z |
() | Treat as a set |
+ | Repeat one or more of the previous pattern |
\. | Escaped dot; matches a literal "." |
* | Repeat 0 or more times of the previous pattern |
[a-z]{2,} | Two or more lowercase letters |
\Z | The end of the string |
<?php
/**
 * Check whether a string looks like a (lowercase) e-mail address.
 *
 * @param string $email Text to validate.
 * @return int 1 when $email matches the pattern, 0 otherwise.
 */
function validate_email_format(string $email) : int {
    // \z anchors at the very end of the subject. The previous "$\Z" also
    // matched just before a trailing newline, so "user@example.com\n"
    // was accepted as valid. \A already anchors the start, so "^" is dropped.
    $pattern = '/\A[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,})\z/';
    return preg_match($pattern, $email);
}
print(validate_email_format('[email protected]'). PHP_EOL);
print(validate_email_format('[email protected]'). PHP_EOL);
print(validate_email_format('[email protected]'). PHP_EOL);
print(validate_email_format('[email protected]'). PHP_EOL);
print(validate_email_format('test@testcom'). PHP_EOL);
Output result
1
0
0
0
0
import re
def validate_email_format(email):
    """Return 1 if ``email`` looks like a (lowercase) e-mail address, else 0.

    The 1/0 return value mirrors what PHP's preg_match() returns in the
    PHP version of this function.
    """
    pattern = r'\A^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,})$\Z'
    return 1 if re.match(pattern, email) else 0
print(validate_email_format('[email protected]'))
print(validate_email_format('[email protected]'))
print(validate_email_format('[email protected]'))
print(validate_email_format('[email protected]'))
print(validate_email_format('test@testcom'))
Output result
1
0
0
0
0
Validate whether it conforms to the URL format (in this case, `ftp` is
also accepted). I used the linked article as a reference.
/^(https?|ftp):\/\/([A-Z0-9][A-Z0-9_-]*(?:\.[A-Z0-9][A-Z0-9_-]*)+):?([0-9]+)?\/?/i
Notation | meaning |
---|---|
/ | delimiter |
() | Group of expressions |
^() | Starts with the group of expressions |
? | The previous pattern is optional (0 or 1 time) |
\| | OR (alternation) |
[] | Character set; matches any one of the characters inside [] |
(?:) | Set not to capture |
+ | Repeat one or more of the previous pattern |
* | Repeat 0 or more times of the previous pattern |
i | Case insensitive |
<?php
/**
 * Extract the leading "scheme://host[:port][/]" part of a URL.
 *
 * Only http, https and ftp are accepted as schemes (case-insensitive).
 *
 * @param string $url URL text to inspect.
 * @return array Two elements: the whole matched prefix and the scheme;
 *               two empty strings when $url does not match.
 */
function pick_url(string $url) : array {
    $pattern = '/^(https?|ftp):\/\/([A-Z0-9][A-Z0-9_-]*(?:\.[A-Z0-9][A-Z0-9_-]*)+):?([0-9]+)?\/?/i';
    $matched = preg_match($pattern, $url, $matches);
    if (!$matched) {
        return ["", ""];
    }
    $whole = $matches[0] ?? "";
    $scheme = $matches[1] ?? "";
    return [$whole, $scheme];
}
print_r(pick_url('http://test.xxx?a=b'));
print_r(pick_url('https://test.xxx/a/b/'));
print_r(pick_url('ftp://test.xxx'));
print_r(pick_url('ftps://test.xxx'));
print_r(pick_url('https:///test.xxx'));
Output result
Array
(
[0] => http://test.xxx
[1] => http
)
Array
(
[0] => https://test.xxx/
[1] => https
)
Array
(
[0] => ftp://test.xxx
[1] => ftp
)
Array
(
[0] =>
[1] =>
)
Array
(
[0] =>
[1] =>
)
import re
def pick_url(url):
    """Extract the scheme and host from the start of a URL.

    Only http, https and ftp are accepted as schemes (case-insensitive).

    Returns ``[scheme, host]``, or ``["", ""]`` when ``url`` does not match.
    """
    pattern = r'^(https?|ftp):\/\/([A-Z0-9][A-Z0-9_-]*(?:\.[A-Z0-9][A-Z0-9_-]*)+):?([0-9]+)?\/?'
    # The pattern is anchored with ^, so there is at most one match and
    # groups 1 (scheme) and 2 (host) always take part in it.
    found = re.match(pattern, url, re.IGNORECASE)
    if found is None:
        return ["", ""]
    return [found.group(1), found.group(2)]


print(pick_url('http://test.xxx?a=b'))
print(pick_url('https://test.xxx/a/b/'))
print(pick_url('ftp://test.xxx'))
print(pick_url('ftps://test.xxx'))
print(pick_url('https:///test.xxx'))
Output result
['http', 'test.xxx']
['https', 'test.xxx']
['ftp', 'test.xxx']
['', '']
['', '']
This example is a sample that replaces the word 'abc' with 'ABC'. I once had a hard time figuring out how to rewrite text word by word, so I made a note of it. This one is not strict either, so I adjust it from case to case.
/(\b)(abc)(\b)/
Notation | meaning |
---|---|
/ | delimiter |
() | Expression set |
\b | Matches a word boundary (the position between a word character and a non-word character) |
<?php
/**
 * Replace every standalone word "abc" in a string with "ABC".
 *
 * @param string $s Text to process.
 * @return string $s with each "abc" that sits between word boundaries
 *                replaced by "ABC".
 */
function replace_abc(string $s) : string {
    $pattern = '/(\b)(abc)(\b)/';
    // ${1} and ${3} re-insert the text captured by the two boundary groups.
    $replacement = '${1}ABC${3}';
    return preg_replace($pattern, $replacement, $s);
}
print(replace_abc('abcd abc" dabcd abc'). "\n");
print(replace_abc('abc dabc abc d "abc"'). "\n");
print(replace_abc('abcd +abc" abc abc?'). "\n");
print(replace_abc('a!abc \'abc\' sabcs abc\'s!'). "\n");
print(replace_abc('ababc? \'abc?\' sabcs abc!?'). "\n");
Output result
abcd ABC" dabcd ABC
ABC dabc ABC d "ABC"
abcd +ABC" ABC ABC?
a!ABC 'ABC' sabcs ABC's!
ababc? 'ABC?' sabcs ABC!?
`${N}` in `'${1}ABC${3}'` refers to the text captured by the N-th group.
As before, Python does not need the delimiter that PHP requires. Also, in the replacement string a captured group is written as `\1`
instead of `${1}`.
import re
def replace_abc(s):
    """Return ``s`` with every standalone word "abc" replaced by "ABC".

    ``\\1`` and ``\\3`` in the replacement re-insert the text captured by
    the two word-boundary groups around the word.
    """
    return re.sub(r'(\b)(abc)(\b)', r'\1ABC\3', s)


print(replace_abc('abcd abc" dabcd abc'))
print(replace_abc('abc dabc abc d "abc"'))
print(replace_abc('abcd +abc" abc abc?'))
print(replace_abc('a!abc \'abc\' sabcs abc\'s!'))
print(replace_abc('ababc? \'abc?\' sabcs abc!?'))
Output result
abcd ABC" dabcd ABC
ABC dabc ABC d "ABC"
abcd +ABC" ABC ABC?
a!ABC 'ABC' sabcs ABC's!
ababc? 'ABC?' sabcs ABC!?
It is assumed that you will get a csv column containing double quotes as shown below.
test.csv
a,b,c
"a,b","b,c","c,d"
"a,b,\"a,b\",,c","a,,b,,",c
"a,,","a,b,\"a,b,c\",c,d,","a\"a,b\",c"
Expected result (each column is wrapped in () for easy identification)
(a), (b), (c)
(a,b), (b,c), (c,d)
(a,b,\"a,b\",,c), (a,,b,,), (c)
(a,,), (a,b,\"a,b,c\",c,d,), (a\"a,b\",c)
The standard CSV functions do not work well when a double quote appears inside a double-quoted value. There may be a way to make them work, but here we use regular expressions instead. Would the idea be something like splitting with preg_split('/,(?!")/', $columns);
?
When I looked into this, I found many ways to extract the columns programmatically. Here, let's do it with a regular expression.
csv.php
<?php
// Read test.csv with fgetcsv() and dump each parsed row.
$file = fopen("test.csv", "r");
if ($file) {
    while (($columns = fgetcsv($file, 0, ',', '"', '"')) !== FALSE) {
        print_r($columns);
    }
    // Close only a handle that was actually opened; calling fclose() on
    // the false returned by a failed fopen() is an error.
    fclose($file);
}
Output result
Array
(
[0] => a
[1] => b
[2] => c
)
Array
(
[0] => "a
[1] => b","b
[2] => c","c
[3] => d"
)
Array
(
[0] => "a
[1] => b
[2] => \"a
[3] => b\"
[4] =>
[5] => c","a
[6] =>
[7] => b
[8] => ,"
[9] => c
)
Array
(
[0] => "a
[1] => ,","a
[2] => b
[3] => \"a
[4] => b
[5] => c\"
[6] => c
[7] => d,","a\"a
[8] => b\"
[9] => c"
)
It seems to be quite accurate. The example test.csv
may look rather extreme, but I cannot complain because I really did encounter a CSV like it.
import csv
# Read test.csv with the standard csv module and print each parsed row.
# newline='' leaves newline handling to the csv module, as its
# documentation recommends for files passed to csv.reader.
with open('test.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)
Output result
['a', 'b', 'c']
['a,b', 'b,c', 'c,d']
['a,b,\\a', 'b\\"', '', 'c"', 'a,,b,,', 'c']
['a,,', 'a,b,\\a', 'b', 'c\\"', 'c', 'd', ',a\\"a', 'b\\"', 'c"']
/(?:\n|\r|\r\n)/
Notation | meaning |
---|---|
/ | delimiter |
() | Expression set, |
(?:) | Set not to capture |
| | Opinion of OR |
[] | Character set,[]One of the characters in |
+ | Repeat one or more of the previous pattern |
The non-capturing group `(?:)` can also be used when removing line feed codes, for example:
$str = preg_replace('/(?:\n|\r|\r\n)/', '', $str);
<?php
/**
 * Lazily yield the lines of a text file.
 *
 * @param string $name Path of the file to read.
 * @return Iterator Generator producing each line as returned by fgets()
 *                  (line ending included); it yields nothing when the file
 *                  cannot be opened.
 */
function my_generator(string $name) : Iterator {
    $file = fopen($name, "r");
    if (!$file) {
        return;
    }
    try {
        // Compare against false explicitly: a final line consisting of "0"
        // without a trailing newline is falsy and would otherwise end the
        // loop before that line is yielded.
        while (($line = fgets($file)) !== false) {
            yield $line;
        }
    } finally {
        // Runs even when the consumer stops iterating early.
        fclose($file);
    }
}
// Matches one double-quoted value, allowing backslash-escaped quotes inside.
$pattern = '/\"(?:\\\"|[^\"])+\"/';
// Maps each generated id to the quoted value it temporarily replaces.
$bks = [];
foreach (my_generator("test.csv") as $v) {
    // Store each value enclosed in double quotes and replace it with a
    // unique id, so that the commas inside it are not treated as separators.
    $columns = preg_replace_callback($pattern, function ($matches) use (&$bks) {
        $index = uniqid();
        $bks[$index] = $matches[0];
        return $index;
    }, $v);
    // Remove line feed code, then split into columns.
    $columns = preg_split('/,/', preg_replace("/\r|\n/", "", $columns));
    // Restore the value replaced by id.
    $new_columns = array_map(function ($column) use ($bks) {
        return array_key_exists($column, $bks) ? $bks[$column] : $column;
    }, $columns);
    print_r($new_columns);
}
Output result
Array
(
[0] => a
[1] => b
[2] => c
)
Array
(
[0] => "a,b"
[1] => "b,c"
[2] => "c,d"
)
Array
(
[0] => "a,b,\"a,b\",,c"
[1] => "a,,b,,"
[2] => c
)
Array
(
[0] => "a,,"
[1] => "a,b,\"a,b,c\",c,d,"
[2] => "a\"a,b\",c"
)
import re
import uuid
# Maps each generated id to the quoted value it temporarily replaces.
bks = {}


def my_generator():
    """Yield the lines of test.csv one at a time."""
    with open('test.csv') as lines:
        yield from lines


def repl(m):
    """Stash the text matched by ``m`` in ``bks`` and return its new id.

    Intended as the replacement callback for ``re.sub``: the returned id
    (a UUID4 string) takes the place of the matched text.
    """
    index = str(uuid.uuid4())
    bks[index] = m.group(0)
    return index
# Matches one double-quoted value, allowing backslash-escaped quotes inside.
pattern = r'\"(?:\\\"|[^\"])+\"'
for line in my_generator():
    # Swap every quoted value for a unique id so the commas inside it are
    # not treated as separators, then strip the line ending and split.
    columns = re.sub(pattern, repl, line).rstrip('\r\n').split(",")
    # Put the stashed quoted values back in place of their ids.
    new_columns = [bks.get(c, c) for c in columns]
    print(new_columns)
Output result
['a', 'b', 'c']
['"a,b"', '"b,c"', '"c,d"']
['"a,b,\\"a,b\\",,c"', '"a,,b,,"', 'c']
['"a,,"', '"a,b,\\"a,b,c\\",c,d,"', '"a\\"a,b\\",c"']
It seems that there are also cases where the double quotes inside the double quotes are not escaped.
Next, I would like to try writing code that can handle notations such as " "a, b", c "," a, b, "a", "b" ", c
.
Recommended Posts