I parse XML every day for http://manga-now.com, so I wanted to see how much faster it would get if I rewrote the Python implementation in Go. Because I only want to compare parsing speed, I measure the time from the point where the XML is already loaded into memory until every element has been extracted.

Download xml

First, use the Amazon Product Advertising API to fetch the XML for the book information and save it to files.

get_books_xml.go

$ mkdir xmls
$ go run get_books_xml.go

If you replace AccessKey, SecretKey, and AssociateTag with your own values and run it, 145 files are saved in the xmls directory. Each file contains information for up to 10 books, for a total of 1442 books.

Run in python

parse_amazon_xml.py

`parse_amazon_xml.py`


# -*- coding:utf-8 -*-
import time
from lxml import objectify


class ImageInfo:
	"""URL, width and height of a single image.

	All three fields start out as the empty string.
	"""

	def __init__(self):
		# One chained assignment gives every field the same '' default.
		self.url = self.width = self.height = ''

class BookInfo:
	"""Record for one book: its text attributes plus its images.

	Text fields default to '' and ``images`` to an empty dict keyed by
	image label.
	"""

	def __init__(self):
		# Every text field shares the same empty-string default.
		for field in ('asin', 'title', 'binding', 'author',
				'publisher', 'publicationDate'):
			setattr(self, field, '')
		# A fresh dict per instance so books never share image entries.
		self.images = {}


def getText(dom, tag):
	"""Return the text of the child named ``tag`` under ``dom``, or '' if absent.

	Args:
		dom: parent node whose children are reachable as attributes.
		tag: name of the child to read.

	Returns:
		The child's ``.text`` when the child exists, otherwise ''.
	"""
	# Look the child up by attribute instead of `tag in dom`: membership on an
	# lxml element is defined for element objects, so a tag-name string would
	# not be found even when the child exists.
	child = getattr(dom, tag, None)
	# Compare against None explicitly; an element's truthiness is unrelated
	# to whether it exists.
	if child is None:
		return ''
	return child.text


def parseXmls(xmls):
	"""Parse XML documents into a flat list of BookInfo records.

	Args:
		xmls: iterable of XML documents, each passed to
			``objectify.fromstring``.

	Returns:
		A list with one BookInfo per ``Items.Item`` element, in document
		order. Each record's ``images`` maps 'SmallImage', 'MediumImage' and
		'LargeImage' to an ImageInfo; a label missing from the item keeps
		the ImageInfo defaults.
	"""
	# The labels never change, so build them once rather than per item.
	imageLabels = ('SmallImage', 'MediumImage', 'LargeImage')

	bookInfos = []
	for xml in xmls:
		dom = objectify.fromstring(xml)
		for item in dom.Items.Item:
			bookInfo = BookInfo()
			bookInfo.asin = item.ASIN.text

			attr = item.ItemAttributes
			bookInfo.title = getText(attr, 'Title')
			bookInfo.binding = getText(attr, 'Binding')
			bookInfo.author = getText(attr, 'Author')
			bookInfo.publisher = getText(attr, 'Publisher')
			bookInfo.publicationDate = getText(attr, 'PublicationDate')

			for imageLabel in imageLabels:
				image = ImageInfo()
				# Attribute lookup rather than `imageLabel in item`:
				# membership on an lxml element is defined for element
				# objects, so a label string would not be found and the
				# image would stay empty.
				node = getattr(item, imageLabel, None)
				if node is not None:
					image.url = node.URL.text
					image.width = int(node.Width.text)
					image.height = int(node.Height.text)
				bookInfo.images[imageLabel] = image

			bookInfos.append(bookInfo)

	return bookInfos


def getXmls():
	"""Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and return their contents.

	Returns:
		A list of the file contents, in ascending file-number order.
	"""
	contents = []
	number = 0
	while number <= 1440:
		with open('xmls/{}.xml'.format(number), 'r') as handle:
			contents.append(handle.read())
		number += 10
	return contents


def main():
	"""Load every XML file, time parseXmls alone, and print a short report.

	The files are read before the timer starts, so only parsing is measured.
	"""
	xmls = getXmls()
	start = time.time()
	bookInfos = parseXmls(xmls)
	elapsed = time.time() - start
	print('Number of xml: {}'.format(len(xmls)))
	print('number of books: {}'.format(len(bookInfos)))
	# " seconds" (space, lowercase) matches the sample output and the
	# "parse time: %f seconds" line printed by the Go programs.
	print('parse time: {} seconds'.format(elapsed))


# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()

$ python parse_amazon_xml.py
Number of xml: 145
number of books: 1442
parse time: 0.14079904556274414 seconds

It was 0.140 seconds. I am using the lxml module for parsing.

Run in Go

parse_amazon_xml.go

`parse_amazon_xml.go`


package main

import (
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io/ioutil"
	"strconv"
	"strings"
	"time"
)

// ImageInfo holds the URL and the dimensions of a single image.
type ImageInfo struct {
	url           string // image URL
	width, height int    // image dimensions
}

// BookInfo is the set of fields extracted for a single book.
type BookInfo struct {
	// Text values copied out of the book's XML.
	asin, title, binding               string
	author, publisher, publicationDate string

	// images holds one ImageInfo per image label (e.g. "SmallImage").
	images map[string]ImageInfo
}

// parseXmls parses each XML document and returns one BookInfo per Item
// element, in document order.
//
// Every BookInfo carries an images entry for each of "SmallImage",
// "MediumImage" and "LargeImage". When an image, or one of its numeric
// fields, is missing or not a number, the corresponding value keeps its
// zero value. It panics if a document cannot be parsed at all.
func parseXmls(xmls []string) []BookInfo {
	// The labels never change, so build the slice once instead of per item.
	imageLabels := []string{"SmallImage", "MediumImage", "LargeImage"}

	bookInfos := []BookInfo{}
	for i, xml := range xmls {
		dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
		if err != nil {
			// Fail with context rather than continuing with an unusable document.
			panic(fmt.Errorf("parsing xml #%d: %w", i, err))
		}
		dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
			bookInfo := BookInfo{asin: item.Find("ASIN").Text()}

			attributes := item.Find("ItemAttributes").First()
			if attributes.Length() > 0 {
				bookInfo.title = attributes.Find("Title").Text()
				bookInfo.binding = attributes.Find("Binding").Text()
				bookInfo.author = attributes.Find("Author").Text()
				bookInfo.publisher = attributes.Find("Publisher").Text()
				bookInfo.publicationDate = attributes.Find("PublicationDate").Text()
			}

			images := make(map[string]ImageInfo, len(imageLabels))
			for _, imageLabel := range imageLabels {
				node := item.Find(imageLabel).First()
				// Atoi errors are deliberately ignored: a missing or
				// non-numeric dimension simply stays 0.
				width, _ := strconv.Atoi(node.Find("Width").Text())
				height, _ := strconv.Atoi(node.Find("Height").Text())
				// Keyed fields make it impossible to mix up width and height.
				images[imageLabel] = ImageInfo{
					url:    node.Find("URL").Text(),
					width:  width,
					height: height,
				}
			}
			bookInfo.images = images

			bookInfos = append(bookInfos, bookInfo)
		})
	}
	return bookInfos
}

// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns
// their contents in that order.
//
// It panics if any file cannot be read, so the benchmark never silently
// measures the parsing of empty documents.
func getXmls() []string {
	const last, step = 1440, 10

	// The number of files is known up front, so size the slice once.
	xmls := make([]string, 0, last/step+1)
	for i := 0; i <= last; i += step {
		path := fmt.Sprintf("xmls/%d.xml", i)
		xml, err := ioutil.ReadFile(path)
		if err != nil {
			panic(fmt.Errorf("reading %s: %w", path, err))
		}
		xmls = append(xmls, string(xml))
	}
	return xmls
}

// main loads the XML files, times parseXmls on them, and prints the document
// count, the book count and the elapsed parse time. File loading happens
// before the timer starts.
func main() {
	docs := getXmls()

	began := time.Now()
	books := parseXmls(docs)
	elapsed := time.Since(began)

	fmt.Printf("Number of xml: %d\n", len(docs))
	fmt.Printf("number of books: %d\n", len(books))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}

$ go run parse_amazon_xml.go
Number of xml: 145
number of books: 1442
parse time: 0.180461 seconds

0.18 seconds. It's slower than Python. I'm using goquery for parsing.

Parallel execution in Go

Go is slower on a single thread, but Go makes parallel execution easy, so let's compare that as well. The CPU used has 2 cores and 4 threads. Only the code that changed is shown below.

parse_amazon_xml_th.go

`parse_amazon_xml_th.go`


// parseXmls is the channel-based variant of the single-threaded parseXmls:
// it takes a result channel as its first argument and has no return value.
// The parsing body is elided here because it is unchanged.
func parseXmls(result chan []BookInfo, xmls []string) {
	...Omitted because it is the same
	// Send the parsed books to the channel (this replaces the return statement).
	result <- bookInfos
}

// divideXmls splits xmls into exactly num contiguous chunks whose
// concatenation, in order, equals xmls.
//
// Chunk sizes differ by at most one: the first len(xmls)%num chunks receive
// one extra element, so no single worker is handed the whole remainder.
// A num below 1 is treated as 1 (a single chunk holding everything) instead
// of dividing by zero. When num exceeds len(xmls) the trailing chunks are
// empty. The chunks are sub-slices of xmls and share its backing array.
func divideXmls(xmls []string, num int) [][]string {
	if num < 1 {
		num = 1
	}
	base, extra := len(xmls)/num, len(xmls)%num

	result := make([][]string, 0, num)
	start := 0
	for i := 0; i < num; i++ {
		end := start + base
		if i < extra {
			end++
		}
		result = append(result, xmls[start:end])
		start = end
	}
	return result
}

// main loads the XML files, splits them into four chunks, parses each chunk
// in its own goroutine, and prints the document count, the book count and
// the elapsed time. Loading and splitting happen before the timer starts.
func main() {
	allXmls := getXmls()
	chunks := divideXmls(allXmls, 4)

	began := time.Now()

	// Each goroutine sends its parsed books on this channel exactly once.
	results := make(chan []BookInfo)
	for _, chunk := range chunks {
		go parseXmls(results, chunk)
	}

	// Receive one result per chunk and merge them in arrival order.
	books := []BookInfo{}
	for range chunks {
		books = append(books, <-results...)
	}

	elapsed := time.Since(began)

	fmt.Printf("Number of xml: %d\n", len(allXmls))
	fmt.Printf("number of books: %d\n", len(books))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}

$ go run parse_amazon_xml_th.go
Number of xml: 145
number of books: 1442
parse time: 0.084918 seconds

0.084 seconds. That is roughly twice as fast as the single-threaded Go version.

Summary

Implementation	speed
Python (lxml)	0.140 seconds
Go (goquery), 1 thread	0.180 seconds
Go (goquery), 4 threads	0.084 seconds

Go only comes out ahead when it is executed in parallel (without parallel execution, there is no advantage to using Go here).

Compare xml parsing speeds with Python and Go