I parse XML every day for http://manga-now.com, and I wanted to find out how much faster it would be if I rewrote the Python implementation in Go. Since I only want to compare parsing speed, I measure the time from the point where the XML has already been read into memory until every element has been extracted.
First, use the Amazon Product Advertising API to download the XML containing the book information and save it to files.
$ mkdir xmls
$ go run get_books_xml.go
Replace AccessKey, SecretKey, and AssociateTag with your own values and run it; 145 files will be saved in the xmls directory. Each file contains information for up to 10 books, for a total of 1442 books.
parse_amazon_xml.py
# -*- coding:utf-8 -*-
import time
from lxml import objectify
class ImageInfo:
    """Holds the URL, width and height recorded for one book image."""

    def __init__(self):
        # All three fields start out as empty strings until they are filled in.
        self.url = self.width = self.height = ''
class BookInfo:
    """Container for the attributes collected for a single book."""

    def __init__(self):
        # Every text attribute defaults to the empty string.
        for field in ('asin', 'title', 'binding', 'author',
                      'publisher', 'publicationDate'):
            setattr(self, field, '')
        # A fresh dict per instance so that books never share image entries.
        self.images = {}
def getText(dom, tag):
    """Return the text of the child of ``dom`` named ``tag``, or '' if absent.

    The child is looked up with attribute access, which is how the rest of
    this script reaches child elements (e.g. ``item.ASIN``). The ``in``
    operator is deliberately not used for the existence check: on lxml
    elements it tests whether an *element object* is a child, so a tag-name
    string never matches and every lookup would fall back to ''.

    :param dom: parent element (any object exposing children as attributes).
    :param tag: name of the child element to read.
    :returns: the child's ``text`` attribute, or '' when there is no such child.
    """
    child = getattr(dom, tag, None)
    return child.text if child is not None else ''
def parseXmls(xmls):
    """Parse XML documents and return one BookInfo per ``Item`` element.

    :param xmls: iterable of XML documents, each passed to
        ``objectify.fromstring``.
    :returns: list of BookInfo in document order. ``images`` always holds an
        ImageInfo for each of the three image labels; labels missing from
        the XML keep the ImageInfo defaults.
    """
    # The labels are constant, so define them once instead of once per item.
    imageLabels = ('SmallImage', 'MediumImage', 'LargeImage')

    bookInfos = []
    for xml in xmls:
        dom = objectify.fromstring(xml)
        for item in dom.Items.Item:
            bookInfo = BookInfo()
            bookInfo.asin = item.ASIN.text

            attr = item.ItemAttributes
            bookInfo.title = getText(attr, 'Title')
            bookInfo.binding = getText(attr, 'Binding')
            bookInfo.author = getText(attr, 'Author')
            bookInfo.publisher = getText(attr, 'Publisher')
            bookInfo.publicationDate = getText(attr, 'PublicationDate')

            for imageLabel in imageLabels:
                image = ImageInfo()
                # Probe for the child via attribute access. `imageLabel in item`
                # would compare a string against child element objects and so
                # never match, leaving every image empty.
                node = getattr(item, imageLabel, None)
                if node is not None:
                    image.url = node.URL.text
                    image.width = int(node.Width.text)
                    image.height = int(node.Height.text)
                bookInfo.images[imageLabel] = image

            bookInfos.append(bookInfo)
    return bookInfos
def getXmls():
    """Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and return their contents.

    :returns: list with the text of each file, in ascending file-number order.
    """
    def readFile(path):
        # Text mode, as in the original; the file is closed on leaving the block.
        with open(path, 'r') as f:
            return f.read()

    return [readFile('xmls/{}.xml'.format(i)) for i in range(0, 1440+1, 10)]
def main():
    """Load the XML files, time parseXmls over them and print a summary.

    Only the parsing step is timed; the files are read before the clock
    starts.
    """
    xmls = getXmls()
    start = time.time()
    bookInfos = parseXmls(xmls)
    end = time.time()
    print('Number of xml: {}'.format(len(xmls)))
    print('number of books: {}'.format(len(bookInfos)))
    # A space before the unit keeps the line in the same
    # "parse time: <value> seconds" shape that the Go program prints.
    print('parse time: {} seconds'.format(end - start))


if __name__ == '__main__':
    main()
$ python parse_amazon_xml.py
Number of xml: 145
number of books: 1442
parse time: 0.14079904556274414 seconds
It was 0.140 seconds. I am using the lxml module for parsing.
parse_amazon_xml.go
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"strconv"
"strings"
"time"
)
// ImageInfo holds the URL and the integer dimensions recorded for one image.
// The field order (url, width, height) matters: the type is also built with
// positional composite literals.
type ImageInfo struct {
	// url is the text of the image's URL element.
	url string
	// width and height are the parsed integer dimensions.
	width, height int
}
// BookInfo holds the attributes extracted for a single book.
type BookInfo struct {
	// Text attributes; each is left empty when the element is absent.
	asin, title, binding, author, publisher, publicationDate string

	// images maps an image label to the ImageInfo parsed for it.
	images map[string]ImageInfo
}
// parseXmls parses every document in xmls and returns one BookInfo per
// "Item" element found, in the order the documents and items appear.
//
// Documents that goquery fails to parse are skipped. For every item an
// ImageInfo is stored under each of the three image labels; a label that is
// missing from the document, or whose dimensions are not integers, yields
// zero values for the affected fields.
func parseXmls(xmls []string) []BookInfo {
	// The labels never change, so build the slice once rather than per item.
	imageLabels := []string{
		"SmallImage",
		"MediumImage",
		"LargeImage",
	}

	bookInfos := []BookInfo{}
	for _, xml := range xmls {
		dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
		if err != nil {
			// No usable document is returned on failure; skip it instead of
			// dereferencing the result.
			continue
		}
		dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
			bookInfo := BookInfo{}
			bookInfo.asin = item.Find("ASIN").Text()

			attributes := item.Find("ItemAttributes").First()
			if attributes.Length() > 0 {
				bookInfo.title = attributes.Find("Title").Text()
				bookInfo.binding = attributes.Find("Binding").Text()
				bookInfo.author = attributes.Find("Author").Text()
				bookInfo.publisher = attributes.Find("Publisher").Text()
				bookInfo.publicationDate = attributes.Find("PublicationDate").Text()
			}

			images := make(map[string]ImageInfo, len(imageLabels))
			for _, imageLabel := range imageLabels {
				// Named node (not xml) so the outer loop variable is not shadowed.
				node := item.Find(imageLabel).First()
				// Atoi errors are deliberately ignored: a missing or malformed
				// dimension is recorded as 0.
				width, _ := strconv.Atoi(node.Find("Width").Text())
				height, _ := strconv.Atoi(node.Find("Height").Text())
				// Keyed literal so width and height cannot be transposed again.
				images[imageLabel] = ImageInfo{
					url:    node.Find("URL").Text(),
					width:  width,
					height: height,
				}
			}
			bookInfo.images = images
			bookInfos = append(bookInfos, bookInfo)
		})
	}
	return bookInfos
}
// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns their
// contents as strings, in that order.
//
// The error from ReadFile is discarded, so the returned slice always has one
// entry per expected file, holding whatever bytes ReadFile returned.
func getXmls() []string {
	const (
		lastIndex = 1440
		step      = 10
	)
	contents := []string{}
	for n := 0; n*step <= lastIndex; n++ {
		data, _ := ioutil.ReadFile(fmt.Sprintf("xmls/%d.xml", n*step))
		contents = append(contents, string(data))
	}
	return contents
}
// main loads the XML files, times parseXmls over them, and prints the number
// of documents, the number of books and the elapsed parse time.
func main() {
	xmls := getXmls()

	// Only parsing is timed; the files are loaded before the clock starts.
	begin := time.Now()
	books := parseXmls(xmls)
	elapsed := time.Since(begin)

	fmt.Printf("Number of xml: %d\n", len(xmls))
	fmt.Printf("number of books: %d\n", len(books))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}
$ go run parse_amazon_xml.go
Number of xml: 145
number of books: 1442
parse time: 0.180461 seconds
0.18 seconds. It's slower than Python. I'm using goquery for parsing.
Go is slower on a single thread, but Go makes parallel execution easy, so let's measure that as well. The CPU I'm running on has 2 cores and 4 threads. Only the changed code is shown below.
parse_amazon_xml_th.go
// parseXmls is the concurrent variant of the parser: it takes the result
// channel as its first argument and no longer has a return value.
func parseXmls(result chan []BookInfo, xmls []string) {
...Omitted because it is the same
// Send the parsed books to the channel (this replaces the return statement).
result <- bookInfos
}
// divideXmls splits xmls into num contiguous chunks whose concatenation, in
// order, equals xmls.
//
// Chunk sizes differ by at most one: the first len(xmls)%num chunks receive
// one extra element, so no single worker is left with the whole remainder.
// When num exceeds len(xmls) the trailing chunks are empty. A num of zero or
// less yields an empty result instead of dividing by zero.
//
// The chunks alias the backing array of xmls; each chunk's capacity is capped
// at its length so that appending to one chunk cannot overwrite the next.
func divideXmls(xmls []string, num int) [][]string {
	if num <= 0 {
		return [][]string{}
	}

	base, extra := len(xmls)/num, len(xmls)%num
	result := make([][]string, 0, num)
	start := 0
	for i := 0; i < num; i++ {
		end := start + base
		if i < extra {
			end++
		}
		result = append(result, xmls[start:end:end])
		start = end
	}
	return result
}
// main loads the XML files, parses them concurrently with one goroutine per
// chunk, and prints the number of documents, the number of books and the
// elapsed parse time.
func main() {
	allXmls := getXmls()

	// Split the documents into 4 chunks, one per goroutine.
	const workers = 4
	chunks := divideXmls(allXmls, workers)

	begin := time.Now()
	results := make(chan []BookInfo)

	// Start one parser goroutine per chunk.
	for i := range chunks {
		go parseXmls(results, chunks[i])
	}

	// Receive exactly one result per chunk and merge them into one slice.
	// Results arrive in completion order, not chunk order.
	bookInfos := []BookInfo{}
	for range chunks {
		bookInfos = append(bookInfos, <-results...)
	}
	elapsed := time.Since(begin)

	fmt.Printf("Number of xml: %d\n", len(allXmls))
	fmt.Printf("number of books: %d\n", len(bookInfos))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}
$ go run parse_amazon_xml_th.go
Number of xml: 145
number of books: 1442
parse time: 0.084918 seconds
0.084 seconds. It is about twice as fast.
Implementation | speed |
---|---|
Python (lxml) | 0.140 seconds |
Go (goquery), 1 thread | 0.180 seconds |
Go (goquery), 4 threads | 0.084 seconds |
Go is faster only when executed in parallel (there is no advantage to Go unless it is run in parallel).
Recommended Posts