I wrote a command that selects from a table holding a large number of records, writes the result to CSV, and gzips it, so here is a note.
models.py
from logging import getLogger
from django.db import models
import uuid
logger = getLogger(__name__)
class Analytics(models.Model):
    hit_type = models.CharField(max_length=20)
    category = models.CharField(max_length=255, null=True, blank=True)
    action = models.CharField(max_length=255, null=True)
    label = models.CharField(max_length=255, null=True, blank=True)
    value = models.IntegerField(null=True)
    url = models.CharField(max_length=255, null=True)
    created = models.DateTimeField(auto_now_add=True, db_index=True)
    tracking_user = models.ForeignKey('app.TrackingUser', related_name='analytics')

    def __str__(self):
        return '%s - %s' % (self.category, self.tracking_user.uuid)


class TrackingUser(models.Model):
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        return str(self.uuid)
There was a model like the one above that stores data tracking user behavior on the website. This time I wanted to create a command that fetches one month of Analytics records, stores them in a CSV file, and then deletes them. Analytics grows by roughly 100,000 rows a day, so about 3 million rows per month. After trying various approaches, the following code ended up working well.
delete_analytics.py
from datetime import datetime, timedelta, date
from api.models import Analytics
from pytz import timezone, utc
from django.core.management.base import BaseCommand
from django.db import transaction
from dateutil.relativedelta import relativedelta
import csv
import gzip
import shutil
import os
class Command(BaseCommand):
    def add_arguments(self, parser):
        # Named (optional) arguments
        parser.add_argument('--target_date', metavar='target_date', type=str, nargs=None,
                            help='target date')

    @transaction.atomic
    def handle(self, *args, **options):
        if options['target_date']:
            delete_start_date = datetime.strptime(options['target_date'], '%Y-%m-%d').replace(day=1)
        else:
            this_month = date.today().replace(day=1)
            delete_start_date = this_month - relativedelta(months=3)
        delete_end_date = delete_start_date + relativedelta(months=1)
        queryset = Analytics.objects.select_related('tracking_user').filter(created__range=(delete_start_date, delete_end_date))

        # Output to csv
        filename_date = delete_start_date.strftime('%Y%m')
        filename = filename_date + "_event.csv"
        model = queryset.model
        with open(filename, 'w') as csv_file:
            writer = csv.writer(csv_file)
            ## Write the csv header row
            headers = []
            for field in model._meta.fields:
                headers.append(field.name)
            writer.writerow(headers)
            ## Write the fetched records
            for obj in queryset:
                row = []
                for field in headers:
                    val = getattr(obj, field)
                    if callable(val):
                        val = val()
                    row.append(val)
                writer.writerow(row)

        # Compress the csv with gzip
        with open(filename, 'rb') as gzip_in:
            with gzip.open(filename + ".gz", 'wb') as gzip_out:
                shutil.copyfileobj(gzip_in, gzip_out)
        os.remove(filename)

        queryset.delete()
If you run ./manage.py delete_analytics --target_date=2017-06-01 in the directory where manage.py is located, the data for June 2017 is written to 201706_event.csv, compressed with gzip, and then deleted.
The key part this time was the following.
queryset = Analytics.objects.select_related('tracking_user').filter(created__range=(delete_start_date, delete_end_date))
Without select_related('tracking_user'), Django issues a separate SELECT for the related TrackingUser every time a row's tracking_user is accessed, so the command takes much longer. With it written, the command finished in 5 minutes; without it, it took 50 minutes.
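To make the difference concrete, here is a small sketch (the loop bodies are only illustrative; the export loop above performs the same kind of access through getattr):

# Without select_related, every access to the related TrackingUser issues
# an extra SELECT, the classic N+1 query problem.
for obj in Analytics.objects.filter(created__range=(delete_start_date, delete_end_date)):
    _ = obj.tracking_user  # one additional query per Analytics row

# With select_related, tracking_user is fetched via a JOIN in the same query.
for obj in Analytics.objects.select_related('tracking_user').filter(
        created__range=(delete_start_date, delete_end_date)):
    _ = obj.tracking_user  # already loaded, no extra query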
When I ran it on the server, it died from running out of memory... I will think about how to deal with that and write it up again if I manage to handle it.
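A likely cause is that iterating the queryset builds model instances for all of the roughly 3 million rows at once, on top of writing an intermediate uncompressed CSV. A minimal sketch of a lower-memory variant (assuming Python 3; the helper name is hypothetical and this is not what the command above does) would stream rows with QuerySet.iterator() and write straight into the gzip file:

import csv
import gzip

def export_queryset_to_gzip(queryset, gz_filename):
    """Stream a queryset into a gzipped CSV without caching all rows in memory."""
    field_names = [field.name for field in queryset.model._meta.fields]
    # 'wt' opens the gzip file in text mode so csv.writer can write strings directly,
    # skipping the intermediate uncompressed file.
    with gzip.open(gz_filename, 'wt', newline='') as gz_out:
        writer = csv.writer(gz_out)
        writer.writerow(field_names)
        # iterator() avoids caching the whole result set as model instances.
        for obj in queryset.iterator():
            writer.writerow([getattr(obj, name) for name in field_names])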
Reference: django recipe: Dump your queryset out as a CSV file (palewire)