To run, you need to do two things (while in the folder that contains the Python file):
1) Start the worker by running celery with the "worker" argument:
$ celery -A tasks worker --loglevel=info
2) Call the task, i.e. run the script:
$ python filename.py
#######################
# grabhtml.py
import requests
from html import unescape
class GrabHTML(object):
    """Download a web page and return its HTML-unescaped body."""

    def __init__(self):
        """Create a fetcher; it keeps no per-instance state."""
        pass

    def get_html(self, url):
        """Fetch *url* and return a ``(html, status_code)`` tuple.

        The request is sent with a desktop Chrome ``User-Agent`` header and a
        2 second timeout.  ``html`` is the decoded response body with HTML
        character references (``&amp;``, ``&#62;`` ...) resolved; if the body
        cannot be unescaped, the raw ``bytes`` content is returned instead.

        Errors raised by ``requests.get`` itself (timeouts, connection
        failures, ...) are not caught here and propagate to the caller.
        """
        headers = {'User-Agent':
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/57.0.2960.0 Safari/537.36'}
        r = requests.get(url, headers=headers, timeout=2.0)
        return self._unescape_body(r), r.status_code

    @staticmethod
    def _unescape_body(response):
        """Return the unescaped text of *response*, or its raw content.

        ``html.unescape`` only accepts ``str``; giving it the ``bytes`` in
        ``response.content`` always raises ``TypeError``, so the decoded
        ``response.text`` is unescaped instead.
        """
        try:
            return unescape(response.text)
        except (TypeError, ValueError, LookupError):
            # Best effort: keep the undecoded payload rather than lose the page.
            return response.content
#####################
# celery_grabhtml_redis.py
# from grabhtml import GrabHTML # need to import the class if using separate files
from celery import Celery
# pip install celery==3.1.21
# Celery application named 'tasks': the broker is reached over AMQP on
# localhost and the backend is the Redis URL on localhost.
app = Celery(
    'tasks',
    broker='amqp://localhost/',
    backend='redis://localhost/6',
)
# using docker for both broker and backend
# docker run -d --hostname my-rabbit --name some-rabbit rabbitmq:management
# docker pull redis
@app.task
def scrape(url):
    """Celery task: download *url* and store its HTML in the app backend.

    On an HTTP 200 response the page is saved under the key ``str(url)``.
    Responses with any other status code are neither stored nor flagged.
    If fetching or storing raises, an empty marker is written under the key
    ``"error:" + str(url)`` so the failed URL can be found later.
    """
    print("-> Starting: [{}]".format(url))
    parser = GrabHTML()
    try:
        html, status = parser.get_html(url)
        if status == 200:
            app.backend.set(str(url), html)
            print("-> saved html: [{}]".format(url))
    except Exception:
        print('error: {}'.format(url))
        # The original wrote through an undefined name ``r`` (NameError);
        # record the failure through the same backend used for successes.
        app.backend.set("error:" + str(url), "")
# celery -A celery_grabhtml_redis worker --loglevel=info
# ^ run above celery command in terminal while situated in same folder as current file
# from celery.task.control import discard_all
# discard_all()
# ^ use above to clear celery queue
######################
# celery_grabhtml_redis.py
# from celery_grabhtml_redis import scrape # uncomment if you choose separate files
# Text file that lists the URLs to scrape, one per line.
file = r'b:Curls.txt'

# Read the whole file once and keep every line (without its line ending).
urls = []
with open(file, mode='r') as url_file:
    urls = url_file.read().splitlines()
def produce():
    """Queue one ``scrape`` task for every entry of the module-level ``urls``.

    Blank entries (e.g. empty lines in the URL file) are skipped.  A failure
    to submit one URL is reported and does not stop the remaining ones.
    """
    for url in urls:
        if not str(url).strip():
            # Nothing to fetch for an empty line.
            continue
        try:
            scrape.delay(str(url))
            print("* Submitted: [{}]".format(url))
        except Exception as exc:
            # ``Exception`` (not a bare except) so Ctrl-C still aborts the run.
            print("ERROR ", url, exc)
# Submit the URLs only when this file is executed as a script, so that merely
# importing the module (e.g. by a Celery worker) does not enqueue them again.
if __name__ == "__main__":
    produce()
#####################
# urls.txt # example
#http://www.vanguard.com
#http://www.blackrock.com
#http://www.fidelity.com
#http://www.nbim.no
#http://www.troweprice.com
#http://www.wellington.com
#http://www.northerntrust.com
#http://www.mfs.com
#http://www.jpmorganfunds.com
#http://www.us.dimensional.com
#http://www.lgim.com
#http://www.tiaa-cref.org
#http://www.invesco.com
#http://www.mcm.com
#http://www.geodecapital.com
#http://www.columbiamanagement.com
#http://www.dodgeandcox.com
#http://www.oppenheimerfunds.com
#http://www.alliancebernstein.com
#http://www.apg.nl
#http://www.franklintempleton.com
#http://www.jennison.com
#http://www.gsam.com