To run, you need to do two things (while in the folder containing the Python file):
1) Run the worker by executing the Python program with the “worker” argument:
$ celery -A tasks worker --loglevel=info
2) Call the task, i.e. run:
$ python
#######################
#
from html import unescape

import requests


class GrabHTML(object):
    """Minimal HTTP page fetcher built on ``requests``."""

    def __init__(self):
        pass

    def get_html(self, url):
        """Fetch *url* and return a ``(html, status_code)`` tuple.

        The request is sent with a desktop Chrome User-Agent and a
        2 second timeout.  The body is returned as text with HTML
        character references unescaped; if unescaping fails, the raw
        response bytes are returned instead.

        Exceptions raised by ``requests.get`` (timeouts, connection
        errors, ...) are not caught here and propagate to the caller.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/57.0.2960.0 Safari/537.36'}
        r = requests.get(url, headers=headers, timeout=2.0)
        try:
            # unescape() only accepts str.  Passing r.content (bytes) always
            # raised TypeError, so the unescaping was silently skipped; use
            # the decoded body instead.
            html = unescape(r.text)
        except (TypeError, ValueError):
            # Best-effort fallback: hand back the undecoded body.
            html = r.content
        return html, r.status_code
#####################
#
#
# from grabhtml import GrabHTML  # need to import class if using separate files
from celery import Celery  # pip install celery==3.1.21

app = Celery('tasks', broker='amqp://localhost/', backend='redis://localhost/6')
# using docker for both broker and backend
# docker run -d --hostname my-rabbit --name some-rabbit rabbitmq:management
# docker pull redis


@app.task
def scrape(url):
    """Celery task: download *url* and store its HTML in the result backend.

    On an HTTP 200 response the page is written to the backend under the
    key ``str(url)``.  Other status codes are ignored.  If fetching or
    storing raises, an empty value is written under ``"error:" + str(url)``
    so that failed URLs can be found later.
    """
    print("-> Starting: [{}]".format(url))
    parser = GrabHTML()
    try:
        html, status = parser.get_html(url)
        if status == 200:
            app.backend.set(str(url), html)
            print("-> saved html: [{}]".format(url))
    except Exception:
        print('error: {}'.format(url))
        # The original called r.set(...), but no ``r`` exists in this scope,
        # so the handler itself died with NameError.  Record the failure
        # through the same backend used for successful pages.
        app.backend.set("error:" + str(url), "")

# celery -A celery_grabhtml_redis worker --loglevel=info
# ^ run above celery command in terminal while situated in same folder as current file
# from celery.task.control import discard_all
# discard_all()
# ^ use above to clear celery queue
######################
#
#
# from celery_grabhtml_redis import scrape  # uncomment if choose separate file
# NOTE(review): this path looks garbled (possibly a lost backslash) -- confirm
# the real location of the URL list before running.
file = r'b:Curls.txt'
urls = []
with open(file, 'r') as f:
    # The right-hand side of this assignment was missing in the original
    # (a syntax error).  Reconstructed as one URL per line, skipping blank
    # lines -- TODO confirm against the intended input format.
    urls = [line.strip() for line in f if line.strip()]


def produce():
    """Queue one ``scrape`` task per entry in the module-level ``urls`` list.

    Each submission is attempted independently: a failure to enqueue one
    URL is reported on stdout and does not stop the remaining submissions.
    """
    for url in urls:
        try:
            scrape.delay(str(url))
            print("* Submitted: [{}]".format(url))
        except Exception:
            print("ERROR ", url)


produce()
##################### # urls.txt # example # # # # # # # # # # # # # # # # # # # # # # #