To run this, you need to do two things (while in the folder that contains the Python file filename.py):
1) Start the worker by executing the Celery program with the "worker" argument:
$ celery -A tasks worker --loglevel=info
2) Call the task, i.e. run the script:
$ python filename.py
#######################
# grabhtml.py
import requests
from html import unescape


class GrabHTML(object):
    """Fetch a web page over HTTP and return its HTML plus the status code."""

    def __init__(self):
        pass

    def get_html(self, url, timeout=2.0):
        """Download *url* and return ``(html, status_code)``.

        Args:
            url: Address of the page to request.
            timeout: Seconds passed to ``requests.get`` as its timeout
                (defaults to 2.0, the value that used to be hard-coded).

        Returns:
            A tuple ``(html, status_code)`` where ``html`` is the response
            body as text with HTML character references (such as ``&amp;``)
            unescaped, and ``status_code`` is the HTTP status of the response.

        Raises:
            Whatever ``requests.get`` raises (for example on a timeout or a
            connection failure); the request is not wrapped in a try block,
            so those errors propagate to the caller.
        """
        # Send a desktop Chrome User-Agent string with the request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/57.0.2960.0 Safari/537.36'}
        r = requests.get(url, headers=headers, timeout=timeout)
        # unescape() only accepts str. Feeding it the raw bytes of r.content
        # raised TypeError on every call, which a bare ``except`` then hid, so
        # nothing was ever unescaped. r.text is the decoded body.
        html = unescape(r.text)
        return html, r.status_code
#####################
# celery_grabhtml_redis.py
# from grabhtml import GrabHTML  # need to import class if using separate files
from celery import Celery  # pip install celery==3.1.21

app = Celery('tasks', broker='amqp://localhost/', backend='redis://localhost/6')
# using docker for both broker and backend
# docker run -d --hostname my-rabbit --name some-rabbit rabbitmq:management
# docker pull redis


@app.task
def scrape(url):
    """Celery task: fetch *url* and store its HTML in the result backend.

    On a 200 response the HTML is written to the backend under the key
    ``str(url)``. Responses with any other status are neither stored nor
    recorded as errors. If fetching or storing raises, an empty value is
    written under the key ``"error:" + str(url)`` instead.

    Args:
        url: Address of the page to scrape.
    """
    print("-> Starting: [{}]".format(url))
    parser = GrabHTML()
    try:
        html, status = parser.get_html(url)
        if status == 200:
            app.backend.set(str(url), html)
            print("-> saved html: [{}]".format(url))
    except Exception:
        # ``except Exception`` (not a bare except) so worker shutdown signals
        # such as SystemExit / KeyboardInterrupt are not swallowed here.
        print('error: {}'.format(url))
        # The handler used to call ``r.set``, but no ``r`` exists in this
        # module, so the error path itself crashed with NameError. Use the
        # same backend the success path writes to.
        app.backend.set("error:" + str(url), "")

# celery -A celery_grabhtml_redis worker --loglevel=info
# ^ run above celery command in terminal while situated in same folder as current file
# from celery.task.control import discard_all
# discard_all()
# ^ use above to clear celery queue
######################
# celery_grabhtml_redis.py
# from celery_grabhtml_redis import scrape  # uncomment if choose separate file
# NOTE(review): this path looks garbled (possibly a lost backslash) - confirm
# the intended location of the URL list before running.
file = r'b:Curls.txt'
urls = []
with open(file, 'r') as f:
    # One URL per line; splitlines() drops the line terminators.
    urls = f.read().splitlines()


def produce():
    """Submit one ``scrape`` task for every non-blank entry in ``urls``.

    Each URL is queued with ``scrape.delay``. A failure to queue one URL is
    reported on stdout and does not stop the remaining URLs from being
    submitted.
    """
    for url in urls:
        # Blank lines in the input file would otherwise be queued as tasks
        # for an empty URL.
        if not url.strip():
            continue
        try:
            scrape.delay(str(url))
            print("* Submitted: [{}]".format(url))
        except Exception:
            # ``except Exception`` rather than a bare except, so Ctrl-C
            # (KeyboardInterrupt) still aborts the submission loop.
            print("ERROR ", url)


# NOTE: this call runs whenever the module is executed or imported.
produce()
#####################
# urls.txt
# example
#http://www.vanguard.com
#http://www.blackrock.com
#http://www.fidelity.com
#http://www.nbim.no
#http://www.troweprice.com
#http://www.wellington.com
#http://www.northerntrust.com
#http://www.mfs.com
#http://www.jpmorganfunds.com
#http://www.us.dimensional.com
#http://www.lgim.com
#http://www.tiaa-cref.org
#http://www.invesco.com
#http://www.mcm.com
#http://www.geodecapital.com
#http://www.columbiamanagement.com
#http://www.dodgeandcox.com
#http://www.oppenheimerfunds.com
#http://www.alliancebernstein.com
#http://www.apg.nl
#http://www.franklintempleton.com
#http://www.jennison.com
#http://www.gsam.com