python / expert
Snippet
Django File Upload with Streaming and Memory-Efficient Processing
This snippet demonstrates memory-efficient file upload handling in Django using streaming and chunked processing. The ChunkedFileUpload class computes checksums without loading entire files into memory, while StreamingCSVProcessor processes CSV data row-by-row. Key features include configurable chunk sizes, duplicate detection via cache, and progress tracking for long-running operations.
snippet.py
import codecs
import csv
import hashlib

from django.core.cache import cache
from django.core.files.uploadedfile import UploadedFile
from django.db import connection

from myapp.validators import FileValidator


class ChunkedFileUpload:
    CHUNK_SIZE = 8192                    # bytes read per iteration
    MAX_MEMORY_SIZE = 10 * 1024 * 1024   # 10 MB streaming threshold

    def __init__(self, uploaded_file: UploadedFile):
        self.file = uploaded_file
        self.checksum = None
        self.processed_chunks = 0

    def compute_checksum(self):
        # Hash chunk by chunk so arbitrarily large uploads never have to
        # be held in memory at once.
        hasher = hashlib.sha256()
        for chunk in self.file.chunks(self.CHUNK_SIZE):
            hasher.update(chunk)
        self.checksum = hasher.hexdigest()
        return self.checksum

    def process_in_chunks(self, processor_func):
        results = []
        for chunk in self.file.chunks(self.CHUNK_SIZE):
            results.append(processor_func(chunk))
            self.processed_chunks += 1
        return results

    def is_memory_efficient(self):
        # Files above the threshold should be streamed rather than buffered.
        return self.file.size > self.MAX_MEMORY_SIZE


class StreamingCSVProcessor:
    def __init__(self, file_upload: ChunkedFileUpload):
        self.uploader = file_upload
        self.validator = FileValidator()
        self.processed_rows = 0

    def process_csv_stream(self, destination):
        decoder = codecs.getincrementaldecoder('utf-8')()
        buffer = ''
        with connection.cursor() as cursor:
            for chunk in self.uploader.file.chunks(ChunkedFileUpload.CHUNK_SIZE):
                # Incremental decoding copes with multi-byte characters that
                # straddle a chunk boundary.
                buffer += decoder.decode(chunk)
                lines = buffer.split('\n')
                # The last element may be an incomplete row; keep it until the
                # next chunk (or the final flush) completes it.
                buffer = lines.pop()
                for line in lines:
                    self._process_row(cursor, line)
            # Flush whatever is left over after the last chunk.
            buffer += decoder.decode(b'', final=True)
            if buffer.strip():
                self._process_row(cursor, buffer)

    def _process_row(self, cursor, line):
        line = line.strip()
        if not line:
            return
        row = next(csv.reader([line]))
        if self.validator.validate_row(row):
            self.insert_row_bulk_style(cursor, row)
            self.processed_rows += 1
            if self.processed_rows % 1000 == 0:
                # Publish progress with a short TTL so other requests can poll it.
                cache.set('csv_progress', self.processed_rows, 30)

    def insert_row_bulk_style(self, cursor, row):
        values = [self.validator.sanitize(v) for v in row]
        cursor.execute(
            'INSERT INTO imported_data (data) VALUES (%s)',
            [','.join(values)],
        )


def handle_large_upload(request):
    uploaded_file = request.FILES['file']
    uploader = ChunkedFileUpload(uploaded_file)
    if uploader.is_memory_efficient():
        # Deduplicate by checksum before doing any expensive processing.
        checksum = uploader.compute_checksum()
        if cache.get(f'upload_checksum_{checksum}'):
            return {'status': 'duplicate', 'checksum': checksum}
        processor = StreamingCSVProcessor(uploader)
        processor.process_csv_stream(None)
        cache.set(f'upload_checksum_{checksum}', True, 3600)
        return {'status': 'processed', 'rows': processor.processed_rows}
    # Files under MAX_MEMORY_SIZE fall outside the streaming path; a buffered
    # fallback is sketched after the breakdown below.
    return {'status': 'skipped', 'reason': 'below streaming threshold'}
django
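For context, here is a minimal sketch of how handle_large_upload could be wired into a view. JsonResponse and the upload_csv view name are assumptions layered on top of the snippet, not part of it.

# views.py (illustrative)
from django.http import JsonResponse
from django.views.decorators.http import require_POST

@require_POST
def upload_csv(request):
    # Hypothetical view wrapping the snippet's entry point.
    if 'file' not in request.FILES:
        return JsonResponse({'error': 'no file provided'}, status=400)
    result = handle_large_upload(request)
    status = 409 if result.get('status') == 'duplicate' else 200
    return JsonResponse(result, status=status)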
Breakdown
1
for chunk in self.file.chunks(self.CHUNK_SIZE)
Iterates over the file in fixed-size chunks so a large upload is never loaded into memory in one piece
2
hasher.update(chunk)
Updates the hash accumulator chunk by chunk, so the SHA-256 of an arbitrarily large file is computed without buffering it; see the equivalence sketch after this breakdown
3
is_memory_efficient()
Checks whether the file size exceeds MAX_MEMORY_SIZE to choose streaming over buffered processing; a buffered fallback is sketched after this breakdown
4
cache.set('csv_progress', self.processed_rows, 30)
Stores progress in the cache with a short TTL so other requests can monitor a long-running import; see the polling sketch after this breakdown
5
cursor.execute('INSERT INTO imported_data...', [','.join(values)])
Executes raw parameterized SQL for fast inserts during a bulk streaming import; a batched executemany variant is sketched below
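To illustrate breakdown items 1 and 2, a small sketch showing that chunked hashing yields the same digest as hashing the whole payload at once. SimpleUploadedFile is Django's standard in-memory test double; the payload here is made up.

import hashlib
from django.core.files.uploadedfile import SimpleUploadedFile

payload = b'a,b,c\n' * 50_000                  # ~300 KB of fake CSV bytes
upload = SimpleUploadedFile('demo.csv', payload)

# Chunked hashing, exactly as ChunkedFileUpload.compute_checksum does it.
hasher = hashlib.sha256()
for chunk in upload.chunks(8192):
    hasher.update(chunk)

# Hashing the whole payload in one call produces the identical digest.
assert hasher.hexdigest() == hashlib.sha256(payload).hexdigest()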
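Breakdown item 3 only gates the streaming path; the snippet never shows what happens to smaller files. A hedged sketch of a buffered counterpart, assuming such uploads are safe to read in one call (process_small_upload is a hypothetical name; ChunkedFileUpload and handle_large_upload come from the snippet above):

import csv
import io

def process_small_upload(request):
    # Hypothetical buffered path for files under MAX_MEMORY_SIZE; the size
    # check bounds how much is read into memory at once.
    uploaded_file = request.FILES['file']
    uploader = ChunkedFileUpload(uploaded_file)
    if uploader.is_memory_efficient():
        # Too large to buffer: defer to the streaming entry point instead.
        return handle_large_upload(request)
    text = uploaded_file.read().decode('utf-8')
    rows = [row for row in csv.reader(io.StringIO(text)) if row]
    return {'status': 'processed', 'rows': len(rows)}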
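For breakdown item 4, a sketch of how a separate request could read the cached counter; the csv_progress view name and JSON shape are assumptions.

from django.core.cache import cache
from django.http import JsonResponse

def csv_progress(request):
    # Reads the counter written by process_csv_stream; because the entry has
    # a 30-second TTL, a finished or stale import simply reports zero again.
    return JsonResponse({'processed_rows': cache.get('csv_progress', 0)})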
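For breakdown item 5, execute() per row keeps memory flat but pays one database round trip per row. A sketch of a batched variant with cursor.executemany, assuming the same imported_data table; the batch size is illustrative.

from django.db import connection

def insert_rows_batched(rows, batch_size=500):
    # Accumulate parameter lists and flush them in batches; executemany
    # amortizes per-statement overhead compared with one execute per row.
    sql = 'INSERT INTO imported_data (data) VALUES (%s)'
    batch = []
    with connection.cursor() as cursor:
        for row in rows:
            batch.append([','.join(row)])
            if len(batch) >= batch_size:
                cursor.executemany(sql, batch)
                batch = []
        if batch:
            cursor.executemany(sql, batch)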