Commit afa44e2b authored by Иван Бегтин's avatar Иван Бегтин

Первоначальный импорт

parents
Pipeline #10 canceled with stages
files
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import click
import json
import os
import glob
import csv
import datetime
import io
import xlwt
import sys
import yaml
import shutil
import hashlib
import zlib
from lxml.html import fromstring, etree
from tabulate import tabulate
import requests
import re
from pymongo import MongoClient, DESCENDING, ASCENDING
# Page on stat.customs.ru that listfiles() fetches and scrapes for the
# document tree.
# NOTE(review): the URL looks like an Oracle APEX link with an embedded
# session id (the long number) - confirm it stays valid over time.
CUSTOMS_URL = 'http://stat.customs.ru/apex/f?p=201:7:4435457763961247::NO'
# Base URL prepended to the relative hrefs found in the document tree.
PREFIX_URL = 'http://stat.customs.ru/apex/'
# Directory (relative to the working directory) where downloads are stored.
STORAGE_PATH = 'files'
def get_filename_from_cd(cd):
    """
    Extract a path-safe file name from a Content-Disposition header value.

    Args:
        cd: Raw header value, e.g. ``attachment; filename="report.xls"``,
            or ``None``/empty when the response carried no such header.

    Returns:
        The file name with every ``/`` replaced by ``_``, or ``None`` when
        the header is missing or has no usable ``filename=`` parameter.
    """
    if not cd:
        return None
    # Accept either a double-quoted value or a bare token. A bare token ends
    # at the next ';' so trailing parameters do not leak into the name, and
    # surrounding quotes are never part of the result.
    match = re.search(r'filename=\s*(?:"([^"]*)"|([^;]+))', cd)
    if match is None:
        return None
    quoted, bare = match.groups()
    name = (quoted if quoted is not None else bare).strip()
    if not name:
        return None
    # Path separators would let the name escape the storage directory.
    return name.replace('/', '_')
def _store_files(data):
    """
    Download every listed document into STORAGE_PATH, skipping existing files.

    For each record a HEAD request is issued first to learn the file name
    from the Content-Disposition header; the body is fetched only when no
    file of that name is stored yet.

    Args:
        data: Sequence of dicts, each with a ``'url'`` key. Records are
            processed in reverse order.

    Returns:
        None. Progress is reported with ``print``.
    """
    # The storage directory may not exist on a fresh checkout.
    os.makedirs(STORAGE_PATH, exist_ok=True)
    with requests.Session() as s:
        for record in reversed(data):
            url = record['url']
            r = s.head(url, allow_redirects=True)
            filename = get_filename_from_cd(r.headers.get('content-disposition'))
            if filename is None:
                # Without a name there is nowhere sensible to store the file.
                print('No filename for %s, skipped' % (url))
                continue
            out = os.path.join(STORAGE_PATH, filename)
            if os.path.exists(out):
                print('Filename %s exists' % (filename))
                continue
            r = s.get(url, allow_redirects=True)
            if not r.ok:
                # Do not persist an error page: it would be treated as an
                # already-downloaded file on every later run.
                print('Filename %s not downloaded, HTTP status %s' % (filename, r.status_code))
                continue
            with open(out, 'wb') as f:
                f.write(r.content)
            print('Filename %s downloaded' % (filename))
def listfiles():
    """
    Scrape the customs statistics page, list its documents and download them.

    The document tree is read from the third ``<script>`` element of the
    page at CUSTOMS_URL, converted from a JavaScript literal to JSON, and
    flattened into one record per document. The records are printed as a
    table, written to ``filelist.csv`` in the working directory, and passed
    to ``_store_files`` for download.

    Returns:
        A list of dicts with the keys ``year``, ``group``, ``docname`` and
        ``url``, in page order.
    """
    page = requests.get(CUSTOMS_URL)
    # Everything after the first '=' in the script text is taken as the tree
    # literal (presumably the script is a single `name = [...];` assignment).
    js = etree.HTML(page.text).xpath('//script')[2].text
    js = js.split('=', 1)[1].strip()
    # Quote the bare JavaScript keys so the literal parses as JSON.
    for key in ('attributes', 'data', 'children'):
        js = js.replace(key + ':', '"' + key + '":')
    js = js.strip().strip(';')
    jsdata = json.loads(js)

    # Three levels are walked: year -> group -> document.
    records = []
    for yd in jsdata:
        year = yd['data']['title']
        for gd in yd['children']:
            group = gd['data']['title']
            for dd in gd['children']:
                records.append({
                    'year': year,
                    'group': group,
                    'docname': dd['data']['title'],
                    'url': PREFIX_URL + dd['data']['attributes']['href'],
                })
    print(tabulate(records))

    # newline='' is what the csv module expects from its file object; an
    # explicit UTF-8 encoding keeps non-ASCII titles writable regardless of
    # the platform's default encoding.
    with open('filelist.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['year', 'group', 'docname', 'url'])
        writer.writeheader()
        writer.writerows(records)

    _store_files(records)
    return records
# Command group that owns this script's subcommands (see `collect`).
@click.group()
def cli1():
    # Intentionally empty: the group only serves as a container for commands.
    pass
@cli1.command()
def collect():
    """Collect customs files"""
    # Thin CLI wrapper: all work happens in listfiles(); its returned list
    # of records is not used here.
    listfiles()
# Single entry point exposing the commands registered on the cli1 group.
cli = click.CommandCollection(sources=[cli1])

# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    cli()
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment