# Usage: python downloader_downloader.py
#
# Source: 期权论坛 (Options Forum) script archive.
# Posted by an anonymized user, 2022-05-29 19:38 (1944 views, 0 replies).

#!/usr/bin/python

# -*- coding: utf-8 -*-

import json

import os

import time

import logging

import threading

import requests

from singleton import singleton

from thread_result import ThreadResult

from threadpool import ThreadPool

from valid_param import valid_param, multi_type, null_type

class Downloader(object):
    """Multi-threaded HTTP downloader with resumable, part-based transfers.

    The target file is split into 1 MiB parts that are fetched concurrently
    by a ThreadPool using HTTP Range requests.  Each finished part is
    recorded in a JSON progress file (``<dir>/info/<name>.info``) next to a
    pre-allocated ``<name>.download`` temp file, so an interrupted transfer
    can be resumed.
    """

    @valid_param(session=null_type(requests.Session))
    def __init__(self, session=None) -> None:
        """
        :param session: optional requests.Session; when omitted, a default
                        session with browser-like headers is created
        """
        super().__init__()
        # Session object shared by every request this instance issues.
        self._session = session
        if not self._session:
            self._session = requests.session()
            self._session.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/57.0.2987.133 Safari/537.36',
                'Connection': 'Keep-alive',
                'Accept-Ranges': 'bytes',
                'Accept-Language': 'zh-CN',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            }

    # FIX: the validator keyword was ``file_path``, which matches no parameter
    # of this method -- the parameter is named ``file_dir``.
    @valid_param(url=(str, '0 != len(x)'), file_dir=null_type(str), file_name=null_type(str),
                 forced=bool, retry_count=int, thread_num=int)
    def download(self, url, file_dir='', file_name='', forced=False, resume=True, retry_count=0, thread_num=5):
        '''
        download file
        :param url: url address
        :param file_dir: file save dir, if it is empty(None or empty string), the current path is used
        :param file_name: file name, if it is empty(None or empty string), the last part of url is used
        :param forced: whether to force the download, default is False
        :param resume: whether to continue from the previous location to download, default is True
        :param retry_count: retry count, default is 0
        :param thread_num: the number of subroutines downloaded. default is 5
        :return: result
        '''
        # One initial attempt plus retry_count retries.
        for attempt in range(retry_count + 1):
            result = self._download(url, file_dir, file_name, forced, resume, thread_num)
            if result:
                return result
            if attempt != retry_count:
                logging.info('retry-%s download %s.' % (attempt + 1, url))
        return False

    def _download(self, url, file_dir, file_name, forced, resume, thread_num):
        """Perform a single download attempt; return True on success."""
        # Normalize destination paths.
        file_dir = self._format_directory(file_dir)
        file_name = self._format_file_name(file_name, url)
        file_path = file_dir + file_name
        info_dir = file_dir + 'info/'
        info_path = info_dir + file_name + '.info'
        temp_path = file_dir + file_name + '.download'
        # If the file already exists and a forced download was not requested,
        # there is nothing to do.
        if os.path.exists(file_path) and not forced:
            logging.info('file [%s] already exists, no download required' % file_path)
            return True
        # Probe the file size with a HEAD request.
        try:
            res = self._session.head(url)
            file_size = int(res.headers.get('content-length'))
            if file_size:
                logging.debug('url [%s] size is %s' % (url, self._conversion_size_unit(file_size)))
            else:
                logging.error('url [%s] does not exist.' % url)
                return False
        except Exception as e:
            logging.error('Exception %s.' % e)
            logging.error('url [%s] does not support download.' % url)
            return False
        # Reload saved progress when both the info file and the temp file
        # survive; otherwise wipe any stale leftovers and start fresh.
        info_dict = {}
        if os.path.exists(info_path) and os.path.exists(temp_path):
            with open(info_path, 'r') as info_fp:
                info_dict = json.load(info_fp)
        else:
            if os.path.exists(info_path):
                os.remove(info_path)
            if os.path.exists(temp_path):
                os.remove(temp_path)
        info_dict['url'] = url
        info_dict['path'] = file_path
        info_dict['name'] = file_name
        info_dict['size'] = file_size
        info_dict['time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if not info_dict.get('part'):
            info_dict['part'] = {}
        logging.info('ready to download the file url [%s] to the path [%s]. file size is %s' %
                     (url, file_path, self._conversion_size_unit(file_size)))
        # Create the destination directories when missing.
        if not os.path.exists(info_dir):
            os.makedirs(info_dir)
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        # Pre-allocate the temp file so every worker can seek+write into it.
        if not os.path.exists(temp_path):
            with open(temp_path, 'w+') as fp:
                fp.truncate(file_size)
        # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for elapsed-time measurement.
        start_time = time.perf_counter()
        lock = threading.Lock()
        if thread_num <= 0:
            thread_num = 5
        pool = ThreadPool(thread_num)
        # Mode must be 'rb+', not 'a': in append mode write() always goes to
        # the end of the file and ignores the current seek position.
        with open(temp_path, 'rb+') as fp:
            # Size of one download unit (1 MiB).
            DOWNLOAD_PART_SIZE = 1024 * 1024
            # Number of parts, rounding up for the trailing remainder.
            task_num = (file_size + DOWNLOAD_PART_SIZE - 1) // DOWNLOAD_PART_SIZE
            for i in range(task_num):
                start = DOWNLOAD_PART_SIZE * i
                if i != task_num - 1:
                    # Inclusive Range end: one byte before the next part.
                    end = start + DOWNLOAD_PART_SIZE - 1
                else:
                    # FIX: HTTP byte ranges are zero-based and inclusive, so
                    # the last byte offset is file_size - 1, not file_size.
                    end = file_size - 1
                # Skip parts already confirmed complete when resuming.
                if resume and info_dict['part'].get('%s-%s' % (start, end)):
                    continue
                pool.apply_async(self._download_part, (url, start, end, fp, info_dict, lock))
            pool.close()
            pool.wait()
        # The attempt succeeded only when every scheduled part succeeded.
        result = True
        for task in pool.task_list():
            if not task.get_result():
                result = False
        info_dict['result'] = result
        with open(info_path, 'w+') as info_fp:
            info_fp.write(json.dumps(info_dict))
        # NOTE(review): when every part was skipped via resume the task list
        # is empty and this reports failure -- confirm that is intentional.
        if result and 0 != len(pool.task_list()):
            os.rename(temp_path, file_path)
            logging.info('file [%s] size [%s]. download complete. time: %0.3f s' %
                         (file_path, self._conversion_size_unit(file_size), time.perf_counter() - start_time))
            return True
        else:
            logging.error('file [%s] download failure.' % file_path)
            return False

    @valid_param(url=(str, '0 != len(x)'), start=int, end=int)
    def _download_part(self, url, start, end, fp, info_dict, lock):
        '''
        download file part
        :param url: url
        :param start: first byte offset of the part (inclusive)
        :param end: last byte offset of the part (inclusive)
        :param fp: file handle shared with the other workers
        :param info_dict: shared progress dict; the finished part is recorded here
        :param lock: threading lock serializing access to fp and info_dict
        :return: True on success, False on failure
        '''
        part_headers = {
            'Range': 'bytes=%d-%d' % (start, end)
        }
        try:
            logging.debug('start download url [%s] part [%s - %s].' % (url,
                          self._conversion_size_unit(start),
                          self._conversion_size_unit(end)))
            r = self._session.get(url, headers=part_headers, timeout=30, stream=True)
            content = r.content
            if not content:
                return False
            if lock:
                lock.acquire()
            # FIX: release the lock in a finally block -- previously an
            # exception raised while writing was swallowed by the outer
            # except with the lock still held, deadlocking every other worker.
            try:
                fp.seek(start)
                fp.write(content)
                fp.flush()
                info_dict['part']['%s-%s' % (start, end)] = True
            finally:
                if lock:
                    lock.release()
            logging.debug('download url [%s] part [%s - %s] complete.' % (url,
                          self._conversion_size_unit(start),
                          self._conversion_size_unit(end)))
            return True
        except Exception as e:
            logging.error('Exception %s.' % e)
            logging.error('download url [%s] part [%s - %s] failure.' % (url,
                          self._conversion_size_unit(start),
                          self._conversion_size_unit(end)))
            # FIX: return an explicit failure flag instead of implicitly
            # returning None.
            return False

    @valid_param(file_name=multi_type(str, None), url=(str, '0 != len(x)'))
    def _format_file_name(self, file_name, url):
        '''
        format file name
        :param file_name: file name; empty/None falls back to the last url path segment
        :param url: url
        :return: file name with characters illegal in Windows file names replaced by spaces
        '''
        if file_name is None or 0 == len(file_name):
            file_name = url.split('/')[-1]
        # FIX: the original chained replace('', ' '), which inserts a space
        # between EVERY character ("abc" -> " a b c ").  The intent is to
        # strip the characters Windows forbids in file names: < > : " * ? / \ |
        for ch in '<>:"*?/\\|':
            file_name = file_name.replace(ch, ' ')
        return file_name

    # FIX: the validator keyword was ``dir`` (which also shadows the builtin)
    # but the parameter is named ``directory``.
    @valid_param(directory=multi_type(str, None))
    def _format_directory(self, directory):
        '''
        format directory
        :param directory: directory; empty/None means the current path './'
        :return: normalized directory guaranteed to end with a path separator
        '''
        if directory is None or 0 == len(directory):
            directory = './'
        elif '\\' != directory[-1] and '/' != directory[-1]:
            directory += '/'
        # FIX: replace('', ' ') inserted a space between every character; strip
        # the Windows-forbidden characters instead (slashes are kept since this
        # is a path).
        # NOTE(review): replacing ':' also mangles drive letters like 'C:/' --
        # confirm callers never pass absolute Windows paths.
        for ch in '<>|:"*?':
            directory = directory.replace(ch, ' ')
        return directory

    @valid_param(size=int)
    def _conversion_size_unit(self, size):
        """Return ``size`` in bytes as a human-readable B/KB/MB/GB string."""
        if not isinstance(size, int):
            raise TypeError('size type not is int type.')
        if size < 1024:
            return '%dB' % size
        elif 1024 <= size < 1024 * 1024:
            return '%0.3fKB' % (size / 1024)
        elif 1024 * 1024 <= size < 1024 * 1024 * 1024:
            return '%0.3fMB' % (size / 1024 / 1024)
        elif 1024 * 1024 * 1024 <= size:
            return '%0.3fGB' % (size / 1024 / 1024 / 1024)

# --- Forum page footer (non-code residue from the scraped page) ---
# Page actions: 一键复制 (copy) / 编辑 (edit) / Web IDE / 原始数据 (raw) /
#               按行查看 (view by line) / 历史 (history) / 分享到 (share)
# 0 人收藏 (0 bookmarks); 您需要登录后才可以回帖 (login required to reply)
# Board stats: 积分 (points): 81; 帖子 (posts): 4969; 精华 (featured): 0
# 期权论坛 (Options Forum) -- 下载期权论坛手机APP (download the mobile app)