Compare xml parsing speeds with Python and Go

I parse XML every day at http://manga-now.com, so I measured how much faster it would be if I rewrote the Python implementation in Go. Since I want to compare only the parsing speed, I time the interval from the point where the XML has already been read into memory until every element has been extracted.

Download xml

First, use the Amazon Product Advertising API to fetch the XML of the book information and save it to files.

get_books_xml.go

$ mkdir xmls
$ go run get_books_xml.go

If you change AccessKey, SecretKey, and AssociateTag to appropriate ones and execute it, 145 files will be saved in the xmls directory. One file contains information for up to 10 books, for a total of 1442 books.

Run in python

parse_amazon_xml.py

parse_amazon_xml.py


# -*- coding:utf-8 -*-
import time
from lxml import objectify


class ImageInfo:
	"""URL, width and height of one image; every field starts as ''."""

	def __init__(self):
		# A single chained assignment binds the three attributes left to right.
		self.url = self.width = self.height = ''

class BookInfo:
	"""Fields collected for one book; text fields start as '' and images as {}."""

	def __init__(self):
		# All text fields share the same empty-string default.
		for field in ('asin', 'title', 'binding', 'author', 'publisher', 'publicationDate'):
			setattr(self, field, '')
		# Each instance gets its own dict, filled by the parser with one entry per image label.
		self.images = dict()


def getText(dom, tag):
	"""Return the text of the child of `dom` named `tag`, or '' if there is no such child.

	dom -- element whose children are reachable as attributes (lxml.objectify style).
	tag -- name of the child element to read.

	The child is looked up with getattr and a default rather than `tag in dom`:
	lxml's `in` operator on an element tests for child element objects, so a
	tag-name string never matches and every lookup would fall through to ''.
	"""
	child = getattr(dom, tag, None)
	# Compare against None explicitly; an element with empty content may be falsy.
	return child.text if child is not None else ''


def parseXmls(xmls):
	"""Parse XML documents into a flat list of BookInfo objects.

	xmls -- iterable of XML documents, each with Item elements under Items.

	Returns one BookInfo per Item, in document order. Text attributes that are
	absent are left as ''. Every BookInfo gets an entry in `images` for each of
	the three image labels; labels missing from the Item keep a default
	ImageInfo.
	"""
	imageLabels = ('SmallImage', 'MediumImage', 'LargeImage')
	bookInfos = []
	for xml in xmls:
		dom = objectify.fromstring(xml)
		for item in dom.Items.Item:
			bookInfo = BookInfo()
			bookInfo.asin = item.ASIN.text

			attr = item.ItemAttributes
			bookInfo.title = getText(attr, 'Title')
			bookInfo.binding = getText(attr, 'Binding')
			bookInfo.author = getText(attr, 'Author')
			bookInfo.publisher = getText(attr, 'Publisher')
			bookInfo.publicationDate = getText(attr, 'PublicationDate')

			for imageLabel in imageLabels:
				image = ImageInfo()
				# Presence is tested with getattr and a default: lxml's `in`
				# operator on an element checks for child element objects, so
				# `imageLabel in item` is never true for a tag-name string and
				# the image data would be skipped for every book.
				node = getattr(item, imageLabel, None)
				if node is not None:
					image.url = node.URL.text
					image.width = int(node.Width.text)
					image.height = int(node.Height.text)
				bookInfo.images[imageLabel] = image

			bookInfos.append(bookInfo)

	return bookInfos


def getXmls():
	"""Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and return their contents.

	Returns a list of 145 byte strings, one per file, in ascending file order.
	Raises the usual open() error if a file is missing or unreadable.
	"""
	xmls = []
	for i in range(0, 1440+1, 10):
		path = 'xmls/{}.xml'.format(i)
		# Read raw bytes and let the XML parser deal with the encoding. Text
		# mode would decode with the platform's default encoding, which breaks
		# UTF-8 content on non-UTF-8 locales, and lxml refuses already-decoded
		# strings that carry an encoding declaration.
		with open(path, 'rb') as f:
			xmls.append(f.read())
	return xmls


def main():
	"""Load every XML file, time parseXmls on them, and print the counts and elapsed time."""
	xmls = getXmls()
	# The clock covers parsing only; the files are already in memory.
	start = time.time()
	bookInfos = parseXmls(xmls)
	end = time.time()
	print('Number of xml: {}'.format(len(xmls)))
	print('number of books: {}'.format(len(bookInfos)))
	# The unit is separated by a space so the line reads "parse time: 0.14 seconds",
	# matching the output format of the Go programs.
	print('parse time: {} seconds'.format(end - start))


# Run the benchmark only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
$ python parse_amazon_xml.py
Number of xml: 145
number of books: 1442
parse time: 0.14079904556274414 seconds

It was 0.140 seconds. I am using the lxml module for parsing.

Run in Go

parse_amazon_xml.go

parse_amazon_xml.go


package main

import (
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io/ioutil"
	"strconv"
	"strings"
	"time"
)

// ImageInfo holds the URL, width and height read for one image element.
type ImageInfo struct {
	url           string
	width, height int
}

// BookInfo holds the fields extracted for one Item element: its text
// attributes plus the images found for it, keyed by image label.
type BookInfo struct {
	asin, title, binding               string
	author, publisher, publicationDate string
	images                             map[string]ImageInfo
}

// parseXmls parses every document in xmls and returns one BookInfo per Item
// element, in the order the items are encountered.
//
// Text fields whose element is absent are left as "". Each BookInfo gets an
// images entry for SmallImage, MediumImage and LargeImage; when an image
// element is absent its URL is "" and its width and height are 0.
//
// It panics if a document cannot be parsed at all.
func parseXmls(xmls []string) []BookInfo {
	imageLabels := []string{"SmallImage", "MediumImage", "LargeImage"}

	// firstText returns the text of the first tag element below sel, or ""
	// when there is none. Selection.Text concatenates the text of every
	// match, so First keeps repeated elements (for example several Author
	// entries) from being glued together, matching the Python version,
	// which reads only the first child.
	firstText := func(sel *goquery.Selection, tag string) string {
		return sel.Find(tag).First().Text()
	}

	bookInfos := []BookInfo{}
	for i, xml := range xmls {
		dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
		if err != nil {
			// Without a document there is nothing to query; fail with context
			// instead of dereferencing a nil document below.
			panic(fmt.Sprintf("parsing xml document %d: %v", i, err))
		}
		dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
			attributes := item.Find("ItemAttributes").First()
			bookInfo := BookInfo{
				asin:            firstText(item, "ASIN"),
				title:           firstText(attributes, "Title"),
				binding:         firstText(attributes, "Binding"),
				author:          firstText(attributes, "Author"),
				publisher:       firstText(attributes, "Publisher"),
				publicationDate: firstText(attributes, "PublicationDate"),
				images:          make(map[string]ImageInfo, len(imageLabels)),
			}
			for _, imageLabel := range imageLabels {
				node := item.Find(imageLabel).First()
				// A missing image yields empty text, Atoi fails, and the
				// dimension deliberately stays 0, so the errors are ignored.
				width, _ := strconv.Atoi(firstText(node, "Width"))
				height, _ := strconv.Atoi(firstText(node, "Height"))
				bookInfo.images[imageLabel] = ImageInfo{
					url:    firstText(node, "URL"),
					width:  width,
					height: height,
				}
			}
			bookInfos = append(bookInfos, bookInfo)
		})
	}
	return bookInfos
}

// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns their
// contents as strings, in ascending file order.
//
// It panics if any file cannot be read: a silently substituted empty document
// would still be counted and timed, making the benchmark figures misleading.
func getXmls() []string {
	xmls := make([]string, 0, 1440/10+1)
	for i := 0; i <= 1440; i += 10 {
		path := fmt.Sprintf("xmls/%d.xml", i)
		data, err := ioutil.ReadFile(path)
		if err != nil {
			panic(fmt.Sprintf("reading %s: %v", path, err))
		}
		xmls = append(xmls, string(data))
	}
	return xmls
}

// main loads every XML file, times parseXmls on them, and prints the number
// of documents, the number of books and the elapsed seconds.
func main() {
	docs := getXmls()

	// The clock covers parsing only; the files are already in memory.
	begin := time.Now()
	books := parseXmls(docs)
	elapsed := time.Since(begin)

	fmt.Printf("Number of xml: %d\n", len(docs))
	fmt.Printf("number of books: %d\n", len(books))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}
$ go run parse_amazon_xml.go
Number of xml: 145
number of books: 1442
parse time: 0.180461 seconds

0.18 seconds. It's slower than Python. I'm using goquery for parsing.

Parallel execution in Go

Go is slower when run on a single thread, but Go makes parallel execution easy, so let's compare that as well. The CPU used has 2 cores and 4 threads. Only the parts of the code that changed are shown below.

parse_amazon_xml_th.go

parse_amazon_xml_th.go


// parseXmls is the channel-based variant of the single-threaded parseXmls:
// it takes the result channel as its first argument and has no return value.
// The parsing body is elided here because it is unchanged.
func parseXmls(result chan []BookInfo, xmls []string) {
	...Omitted because it is the same
	// Send the parsed books to the caller over the channel; this send
	// replaces the original return statement.
	result <- bookInfos
}

// divideXmls splits xmls into num contiguous chunks whose concatenation, in
// order, equals xmls.
//
// Chunk sizes differ by at most one: the first len(xmls)%num chunks receive
// one extra element, so no single chunk ends up with the whole remainder.
// When num exceeds len(xmls) the trailing chunks are empty. A num of zero or
// less yields an empty result instead of dividing by zero.
//
// The chunks share storage with xmls; each one's capacity is capped at its
// length so that appending to a chunk cannot overwrite the next one.
func divideXmls(xmls []string, num int) [][]string {
	if num <= 0 {
		return [][]string{}
	}
	base, extra := len(xmls)/num, len(xmls)%num
	result := make([][]string, 0, num)
	start := 0
	for i := 0; i < num; i++ {
		end := start + base
		if i < extra {
			end++
		}
		result = append(result, xmls[start:end:end])
		start = end
	}
	return result
}

// main reads all XML files, splits them into four chunks, parses each chunk
// in its own goroutine, and prints the totals with the elapsed time.
func main() {
	allXmls := getXmls()
	chunks := divideXmls(allXmls, 4)

	// Timing starts once the input has been loaded and split.
	begin := time.Now()

	// Start one goroutine per chunk; each sends its books on results.
	results := make(chan []BookInfo)
	for i := range chunks {
		go parseXmls(results, chunks[i])
	}

	// Receive exactly one result per goroutine and merge them into one slice.
	bookInfos := []BookInfo{}
	for range chunks {
		bookInfos = append(bookInfos, <-results...)
	}
	elapsed := time.Since(begin)

	fmt.Printf("Number of xml: %d\n", len(allXmls))
	fmt.Printf("number of books: %d\n", len(bookInfos))
	fmt.Printf("parse time: %f seconds\n", elapsed.Seconds())
}
$ go run parse_amazon_xml_th.go
Number of xml: 145
number of books: 1442
parse time: 0.084918 seconds

0.084 seconds. That is roughly twice as fast as the single-threaded run.

Summary

Implementation speed
Python (lxml) 0.140 seconds
Go (goquery)1 thread 0.180 seconds
Go (goquery)4 threads 0.084 seconds

Go comes out ahead only when it is executed in parallel (there is no speed advantage to Go here unless it is run in parallel).

Recommended Posts

Compare xml parsing speeds with Python and Go
Python with Go
Programming with Python and Tkinter
Python and hardware-Using RS232C with Python-
Generate XML (RSS) with Python
python with pyenv and venv
Process feedly xml with Python.
Works with Python and R
Compare HTTP GET / POST with cURL (command) and Python (programming)
Sample of HTTP GET and JSON parsing with python of pepper
Communicate with FX-5204PS with Python and PyUSB
Shining life with Python and OpenCV
Robot running with Arduino and python
Install Python 2.7.9 and Python 3.4.x with pip.
Neural network with OpenCV 3 and Python 3
Scraping with Node, Ruby and Python
Scraping with Python, Selenium and Chromedriver
Scraping with Python and Beautiful Soup
JSON encoding and decoding with python
Hadoop introduction and MapReduce with Python
[GUI with Python] PyQt5-Drag and drop-
Compare DCGAN and pix2pix with keras
Reading and writing NetCDF with Python
I played with PyQt5 and Python3
Reading and writing CSV with Python
Compare Python and JavaScript array loops
Speed comparison of Python XML parsing
[Python] Parsing randomly generated XML [ElementTree]
Multiple integrals with Python and Sympy
Process Pubmed .xml data with python
Coexistence of Python2 and 3 with CircleCI (1.0)
Easy modeling with Blender and Python
Sugoroku game and addition game with python
FM modulation and demodulation with Python
Communicate between Elixir and Python with gRPC
Data pipeline construction with Python and Luigi
Calculate and display standard weight with python
Monitor Mojo outages with Python and Skype
FM modulation and demodulation with Python Part 3
Process Pubmed .xml data with python [Part 2]
Python installation and package management with pip
Using Python and MeCab with Azure Databricks
Compare raw TensorFlow with tf.contrib.learn and Keras
POST variously with Python and receive with Flask
Capturing images with Pupil, python and OpenCV
Fractal to make and play with Python
A memo with Python2.7 and Python3 on CentOS
Let's try gRPC with Go and Docker
Use PIL and Pillow with Cygwin Python
Create and decrypt Caesar cipher with python
CentOS 6.4 with Python 2.7.3 with Apache with mod_wsgi and Django
Reading and writing JSON files with Python
Dealing with "years and months" in Python
I installed and used Numba with Python3.5
Tweet analysis with Python, Mecab and CaboCha
Linking python and JavaScript with jupyter notebook
Traffic monitoring with Kibana, ElasticSearch and Python
FM modulation and demodulation with Python Part 2
Encrypt with Ruby (Rails) and decrypt with Python
Easily download mp3 / mp4 with python and youtube-dl!
Operate home appliances with Python and IRKit