J'analyse du XML tous les jours sur http://manga-now.com, et j'ai voulu mesurer le gain de vitesse qu'apporterait le passage de l'implémentation Python à Go. Comme je ne veux comparer que la vitesse d'analyse, je mesure le temps écoulé entre le moment où le XML est chargé en mémoire et celui où l'extraction de chaque élément est terminée.

Télécharger xml

Tout d'abord, utilisez l'API Amazon Product Advertising pour télécharger le XML des informations sur les livres et l'enregistrer dans des fichiers.

get_books_xml.go

$ mkdir xmls
$ go run get_books_xml.go

Si vous remplacez AccessKey, SecretKey et AssociateTag par vos propres valeurs et que vous exécutez le programme, 145 fichiers seront enregistrés dans le répertoire xmls. Chaque fichier contient les informations de 10 livres au maximum, pour un total de 1442 livres.

Exécuter en Python

parse_amazon_xml.py

`parse_amazon_xml.py`


# -*- coding:utf-8 -*-
import time
from lxml import objectify


class ImageInfo:
	"""URL and dimensions of a single image.

	Every attribute starts out as an empty string.
	"""

	def __init__(self):
		self.url, self.width, self.height = '', '', ''

class BookInfo:
	"""Fields collected for a single book.

	The text fields default to the empty string; ``images`` is a new,
	empty dict created for each instance.
	"""

	def __init__(self):
		for field in ('asin', 'title', 'binding', 'author', 'publisher', 'publicationDate'):
			setattr(self, field, '')
		self.images = {}


def getText(dom, tag):
	"""Return the text of the child of ``dom`` named ``tag``.

	Args:
		dom: object whose children are reachable as attributes
			(an lxml.objectify element in this script).
		tag: name of the child to read.

	Returns:
		The child's ``text`` attribute, or '' when ``dom`` has no such child.
	"""
	# Look the child up by attribute with a default instead of `tag in dom`:
	# lxml's element membership test expects an element object, so a
	# tag-name string is never reported as contained.
	child = getattr(dom, tag, None)
	# Compare against None explicitly; do not rely on element truthiness.
	return child.text if child is not None else ''


def parseXmls(xmls):
	"""Parse XML documents into a flat list of BookInfo objects.

	Args:
		xmls: iterable of XML documents, each exposing ``Items.Item``
			elements once parsed with ``objectify.fromstring``.

	Returns:
		A list with one BookInfo per ``Item``, in the order encountered.
		Each BookInfo carries an ImageInfo for every image label; labels
		absent from the item keep ImageInfo's defaults.
	"""
	# Loop-invariant, so built once instead of once per item.
	imageLabels = ('SmallImage', 'MediumImage', 'LargeImage')

	bookInfos = []
	for xml in xmls:
		dom = objectify.fromstring(xml)
		for item in dom.Items.Item:
			bookInfo = BookInfo()
			bookInfo.asin = item.ASIN.text

			attr = item.ItemAttributes
			bookInfo.title = getText(attr, 'Title')
			bookInfo.binding = getText(attr, 'Binding')
			bookInfo.author = getText(attr, 'Author')
			bookInfo.publisher = getText(attr, 'Publisher')
			bookInfo.publicationDate = getText(attr, 'PublicationDate')

			for imageLabel in imageLabels:
				image = ImageInfo()
				# Attribute lookup with a default detects a missing child;
				# `imageLabel in item` does not, because element membership
				# expects an element object rather than a tag name.
				node = getattr(item, imageLabel, None)
				if node is not None:
					image.url = node.URL.text
					image.width = int(node.Width.text)
					image.height = int(node.Height.text)
				bookInfo.images[imageLabel] = image

			bookInfos.append(bookInfo)

	return bookInfos


def getXmls():
	"""Read xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml.

	Returns:
		A list with the full text of each file, in ascending file order.
	"""
	contents = []
	offset = 0
	while offset <= 1440:
		with open('xmls/{}.xml'.format(offset), 'r') as handle:
			contents.append(handle.read())
		offset += 10
	return contents


def main():
	"""Load the XML files, time the parsing step, and print the results."""
	xmls = getXmls()
	# Only parseXmls is timed; the files are already in memory at this point.
	start = time.time()
	bookInfos = parseXmls(xmls)
	end = time.time()
	print('Nombre de xml: {}'.format(len(xmls)))
	print('nombre de livres: {}'.format(len(bookInfos)))
	# Double-quoted: the message contains an apostrophe, which would end a
	# single-quoted literal early and make the file fail to compile.
	print("temps d'analyse: {} secondes".format(end - start))


# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
	main()

$ python parse_amazon_xml.py
Nombre de xml: 145
nombre de livres: 1442
temps d'analyse: 0.14079904556274414 secondes

C'était 0,140 seconde. J'utilise le module lxml pour l'analyse.

Exécuter en Go

parse_amazon_xml.go

`parse_amazon_xml.go`


package main

import (
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"io/ioutil"
	"strconv"
	"strings"
	"time"
)

// ImageInfo describes a single image: its URL and its dimensions.
type ImageInfo struct {
	url           string // address of the image
	width, height int    // dimensions parsed as integers
}

// BookInfo holds the fields extracted for a single book.
type BookInfo struct {
	asin, title, binding string
	author, publisher    string
	publicationDate      string
	images               map[string]ImageInfo // keyed by image label
}

// parseXmls extracts one BookInfo for every "Item" element found in each of
// the given XML documents and returns them in the order encountered.
//
// For each item it records the ASIN, the text fields found under
// "ItemAttributes" (when present) and one ImageInfo per image label.
// It panics if goquery cannot build a document from one of the inputs.
func parseXmls(xmls []string) []BookInfo {
	// Loop-invariant, so built once instead of once per item.
	imageLabels := []string{
		"SmallImage",
		"MediumImage",
		"LargeImage",
	}

	bookInfos := []BookInfo{}
	for i, xml := range xmls {
		dom, err := goquery.NewDocumentFromReader(strings.NewReader(xml))
		if err != nil {
			// Fail with a clear message instead of dereferencing a nil
			// document in the Find call below.
			panic(fmt.Sprintf("parseXmls: document %d: %v", i, err))
		}
		dom.Find("Item").Each(func(_ int, item *goquery.Selection) {
			bookInfo := BookInfo{}
			bookInfo.asin = item.Find("ASIN").Text()

			attributes := item.Find("ItemAttributes").First()
			if attributes.Length() > 0 {
				bookInfo.title = attributes.Find("Title").Text()
				bookInfo.binding = attributes.Find("Binding").Text()
				bookInfo.author = attributes.Find("Author").Text()
				bookInfo.publisher = attributes.Find("Publisher").Text()
				bookInfo.publicationDate = attributes.Find("PublicationDate").Text()
			}

			images := make(map[string]ImageInfo, len(imageLabels))
			for _, imageLabel := range imageLabels {
				node := item.Find(imageLabel).First()
				// Atoi errors are ignored on purpose: a missing image yields
				// empty text, for which Atoi returns 0, the default dimension.
				width, _ := strconv.Atoi(node.Find("Width").Text())
				height, _ := strconv.Atoi(node.Find("Height").Text())
				// Keyed fields so width and height cannot be transposed.
				images[imageLabel] = ImageInfo{
					url:    node.Find("URL").Text(),
					width:  width,
					height: height,
				}
			}
			bookInfo.images = images

			bookInfos = append(bookInfos, bookInfo)
		})
	}
	return bookInfos
}

// getXmls reads xmls/0.xml, xmls/10.xml, ..., xmls/1440.xml and returns
// their contents as strings, in that order.
//
// Read errors are ignored: a file that cannot be read contributes an
// empty string to the result.
func getXmls() []string {
	const (
		lastIndex = 1440
		step      = 10
	)
	contents := make([]string, 0, lastIndex/step+1)
	for index := 0; index <= lastIndex; index += step {
		data, _ := ioutil.ReadFile(fmt.Sprintf("xmls/%d.xml", index))
		contents = append(contents, string(data))
	}
	return contents
}

// main loads every XML file, times the parsing step alone, and prints the
// number of documents, the number of books and the elapsed seconds.
func main() {
	docs := getXmls()

	// The clock covers only parseXmls; the files are already in memory.
	began := time.Now()
	books := parseXmls(docs)
	elapsed := time.Since(began)

	fmt.Printf("Nombre de xml: %d\n", len(docs))
	fmt.Printf("nombre de livres: %d\n", len(books))
	fmt.Printf("temps d'analyse: %f secondes\n", elapsed.Seconds())
}

$ go run parse_amazon_xml.go
Nombre de xml: 145
nombre de livres: 1442
temps d'analyse: 0.180461 secondes

0,18 seconde. C'est plus lent que Python. J'utilise goquery pour l'analyse.

Exécution parallèle dans Go

Go est plus lent avec un seul thread, mais Go permet facilement une exécution en parallèle, alors comparons cela également. Le processeur utilisé possède 2 cœurs et 4 threads. Seules les modifications du code sont présentées ci-dessous.

parse_amazon_xml_th.go

`parse_amazon_xml_th.go`


// parseXmls now takes the result channel as its first argument
// and no longer has a return value.
func parseXmls(result chan []BookInfo, xmls []string) {
	...Omis parce que c'est la même chose
	// Send the parsed books over the channel (this replaces the return statement).
	result <- bookInfos
}

// divideXmls splits xmls into num contiguous chunks whose lengths differ by
// at most one, preserving the original order across chunks.
//
// When len(xmls) is not a multiple of num, the first len(xmls)%num chunks
// receive one extra element, so no single chunk carries the whole remainder.
// When num exceeds len(xmls), the trailing chunks are empty. A num of zero
// or less yields an empty result instead of dividing by zero.
func divideXmls(xmls []string, num int) [][]string {
	if num <= 0 {
		return [][]string{}
	}
	size, extra := len(xmls)/num, len(xmls)%num
	result := make([][]string, 0, num)
	start := 0
	for i := 0; i < num; i++ {
		end := start + size
		if i < extra {
			end++
		}
		// Three-index slice caps each chunk at its own end, so appending to
		// one chunk cannot overwrite the elements of the next.
		result = append(result, xmls[start:end:end])
		start = end
	}
	return result
}

// main splits the XML documents into four groups, parses every group in its
// own goroutine, merges the results and prints the counts and elapsed seconds.
func main() {
	allXmls := getXmls()
	// Partition the documents into 4 groups, one per goroutine.
	groups := divideXmls(allXmls, 4)
	began := time.Now()

	results := make(chan []BookInfo)
	// Start one parsing goroutine per group.
	for i := range groups {
		go parseXmls(results, groups[i])
	}
	// Receive exactly one result per group and concatenate them.
	books := make([]BookInfo, 0)
	for range groups {
		books = append(books, <-results...)
	}

	elapsed := time.Since(began)
	fmt.Printf("Nombre de xml: %d\n", len(allXmls))
	fmt.Printf("nombre de livres: %d\n", len(books))
	fmt.Printf("temps d'analyse: %f secondes\n", elapsed.Seconds())
}

$ go run parse_amazon_xml_th.go
Nombre de xml: 145
nombre de livres: 1442
temps d'analyse: 0.084918 secondes

0,084 seconde. C'est environ deux fois plus rapide.

Résumé

Implémentation	Temps d'analyse
Python (lxml)	0.140 secondes
Go (goquery), 1 thread	0.180 secondes
Go (goquery), 4 threads	0.084 secondes

Go n'est avantageux que s'il est exécuté en parallèle (Go n'apporte aucun gain s'il n'est pas exécuté en parallèle).

Comparez les vitesses d'analyse XML avec Python et Go