With the email package from Python's standard library, .eml files (the format used to save emails) are easy to parse.
I wrote a class that extracts the attachments, subject, body, etc. from an .eml file.
# coding:utf-8
"""
Sample that extracts data from an .eml file so that it is easy to handle.
This is a minimal implementation, so some cases may not be covered.
"""
import sys
import email
from email.header import decode_header
class MailParser(object):
    """
    Parse an .eml file and expose its headers, body text and attachments.

    Attributes set by the constructor:
        mail_file_path: the path that was passed in.
        email_message: the message object built by
            ``email.message_from_bytes`` from the file contents.
        subject, to_address, cc_address, from_address: decoded header
            values as ``str`` ("" when the header is absent).
        body: concatenated text of every non-multipart part that has no
            file name.
        attach_file_list: list of ``{"name": file_name, "data": payload}``
            dicts, one per part that carries a file name.
    """

    def __init__(self, mail_file_path):
        """
        Read and parse the mail file at ``mail_file_path``.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        self.mail_file_path = mail_file_path
        # Read the file as bytes and let the email package build the
        # message object from them.
        with open(mail_file_path, 'rb') as email_file:
            self.email_message = email.message_from_bytes(email_file.read())
        self.subject = None
        self.to_address = None
        self.cc_address = None
        self.from_address = None
        self.body = ""
        # Attachment related information
        # {name: file_name, data: data}
        self.attach_file_list = []
        # Interpret the eml content and fill in the attributes above.
        self._parse()

    def get_attr_data(self):
        """
        Return a printable summary of the mail.

        The summary lists the From/To/Cc addresses, the body text and the
        comma-separated attachment file names.
        """
        attach_names = ",".join(x["name"] for x in self.attach_file_list)
        result = """\
FROM: {}
TO: {}
CC: {}
-----------------------
BODY:
{}
-----------------------
ATTACH_FILE_NAME:
{}
""".format(
            self.from_address,
            self.to_address,
            self.cc_address,
            self.body,
            attach_names,
        )
        return result

    def _parse(self):
        """
        Fill in the header, body and attachment attributes.

        Called once from ``__init__``.
        """
        self.subject = self._get_decoded_header("Subject")
        self.to_address = self._get_decoded_header("To")
        self.cc_address = self._get_decoded_header("Cc")
        self.from_address = self._get_decoded_header("From")

        body_chunks = []
        for part in self.email_message.walk():
            # A multipart container has no content of its own; walk()
            # yields the parts nested inside it separately.
            if part.get_content_maintype() == 'multipart':
                continue

            payload = part.get_payload(decode=True)
            attach_fname = part.get_filename()
            if attach_fname:
                # A part with a file name is treated as an attachment.
                self.attach_file_list.append({
                    "name": attach_fname,
                    "data": payload,
                })
                continue

            # No file name, so this should be (part of) the body.
            if payload is None:
                # get_payload(decode=True) returns None for container
                # parts (e.g. message/rfc822); there is no text to add.
                continue
            # Pass the charset through as-is: wrapping it in str() would
            # turn a missing charset (None) into the bogus name "None".
            body_chunks.append(
                self._decode_bytes(payload, part.get_content_charset())
            )
        self.body += "".join(body_chunks)

    def _get_decoded_header(self, key_name):
        """
        Return the header ``key_name`` decoded to ``str``.

        Returns an empty string when the header is not present.
        """
        raw_obj = self.email_message.get(key_name)
        if raw_obj is None:
            return ""

        pieces = []
        for fragment, encoding in decode_header(raw_obj):
            if isinstance(fragment, str):
                # Already text, nothing to decode.
                pieces.append(fragment)
            else:
                # Bytes fragment: use the declared encoding, or UTF-8
                # when none is given.
                pieces.append(self._decode_bytes(fragment, encoding))
        return "".join(pieces)

    @staticmethod
    def _decode_bytes(raw, charset, fallback="utf-8"):
        """
        Decode ``raw`` bytes with ``charset``, never raising on bad input.

        ``fallback`` is used when ``charset`` is empty or is not a codec
        name Python knows; undecodable bytes are replaced.
        """
        try:
            return raw.decode(charset or fallback, errors="replace")
        except LookupError:
            # Unknown codec name declared in the mail.
            return raw.decode(fallback, errors="replace")
if __name__ == "__main__":
    # The script needs the path of the .eml file as its first argument;
    # exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <path-to-eml-file>".format(sys.argv[0]))
    result = MailParser(sys.argv[1]).get_attr_data()
    print(result)
For now, this produces the expected results. I hope it is helpful when working with emails.
Recommended Posts