Category: Python
To run it you need to do two things (while in the folder containing filename.py): 1) Start the worker by running Celery with the "worker" argument: $ celery -A tasks worker --loglevel=info 2) Call the task, i.e. run: $ python filename.py ####################### # grabhtml.py import requests from html import unescape class GrabHTML(object): def __init__(self): pass…
Python Script to insert CSV File into SQL Server Database import pandas as pd import csv import pyodbc import sys, os USERNAME = ‘sa’ PASSWORD = ‘password’ SERVER = ‘server’ DATABASE = ‘DATA’ DRIVERNAME = ‘ODBC Driver 13 for SQL Server’ cnxn = pyodbc.connect(‘Driver={‘+DRIVERNAME+’};Server=’+SERVER+’;Database=’+DATABASE+’;uid=’+USERNAME+’;pwd=’+PASSWORD) cur = cnxn.cursor() ##### using pandas grab first row for column…
I constantly use DocFetcher for indexing documents (PDFs, Excel, Word, HTML, etc. — DocFetcher website) and xplorer2 for viewing/storing results in its scrap container (xplorer2 website). One thing that has been bugging me for a while is that when I copy a list of documents from DocFetcher, I have had to paste it into Excel first before I could paste it into…
# While this is by no means perfect, it got the job done.  To reuse it,
# change the website URL, the calendar element ids, and start_date.
#
# Originally tagged "#! py27"; print statements are rewritten as print()
# so the script also parses on Python 3.  The pasted original had lost all
# newlines/indentation; structure below is the restored reading of it.
import os
import time
from datetime import datetime, date, timedelta

from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Firefox to auto-save Excel downloads without a save dialog.
fp = webdriver.FirefoxProfile()
fp.set_preference('browser.download.folderList', 2)
fp.set_preference('browser.download.manager.showWhenStarting', False)
fp.set_preference('browser.download.dir', os.getcwd())
fp.set_preference('browser.helperApps.neverAsk.saveToDisk',
                  'application/vnd.ms-excel')
fp.set_preference('browser.download.dir', r'c:\tmp')  # NOTE: overrides the getcwd() setting above

driver = webdriver.Firefox(firefox_profile=fp)
driver.get('https://www.zacks.com/earnings/earnings-reports')


def click_calendar():
    """Open the earnings-release date picker on the page."""
    try:
        element_xpath = '//*[@id="earnings_release"]/div[1]/p/a'
        WebDriverWait(driver, 10).until(
            lambda driver: driver.find_element_by_xpath(element_xpath).click())
    finally:
        # The original printed unconditionally, even when the wait timed out.
        print('clicked calendar')


def click_prev_day(x):
    """Click calendar day *x* (element id 'datespan_<x>').

    Returns True when the click succeeded, False otherwise.
    """
    element_id = 'datespan_%d' % (x,)
    try:
        WebDriverWait(driver, 10).until(
            lambda driver: driver.find_element_by_id(element_id).click())
    except Exception:  # was a bare except: in the original
        return False
    return True


def click_export():
    """Click the export-to-Excel link; return True on success."""
    try:
        WebDriverWait(driver, 10).until(
            lambda driver: driver.find_element_by_id('export_excel').click())
    except Exception:  # was a bare except: in the original
        return False
    return True


def click_prev_month():
    """Flip the calendar back one month, then land on its last valid day.

    Months end on day 31, 30, 29, or 28, so walk down from 31 until a day
    element is found.  NOTE: the original wrapped click_prev_day() in
    try/except, but click_prev_day never raises -- it returns a bool -- so
    that loop could only ever run once; the boolean is now checked instead.
    """
    try:
        driver.find_element_by_id('prevCal').click()
    except Exception:  # original swallowed failures here as well
        pass
    for day in range(31, 27, -1):
        if click_prev_day(day):
            return False  # kept: the original returned False after clicking
        print('could not find %s in prev month' % (day,))


def subtract_day(n):
    """Return n - 1 (the en-dash in the pasted original restored to '-')."""
    return n - 1


def start_date():
    """Earliest date the scraper should walk back to."""
    return datetime(2016, 2, 29)


def click_to_start_date():
    """Page the calendar back from today to the hard-coded start date.

    NOTE(review): months_back ignores the year component, so this only
    works when the start date falls within the current calendar year --
    confirm before reuse.
    """
    target = datetime(2016, 2, 28)
    today = date.today()
    months_back = today.month - target.month
    if months_back > 0:
        click_calendar()
    while months_back > 0:
        click_prev_month()
        months_back -= 1
    # Original tried day 31 and "fell back" to 30 via a dead try/except;
    # the fallback is expressed with the boolean result instead.
    if not click_prev_day(31):
        click_prev_day(30)


def main():
    """Walk back twelve months, exporting each day's earnings table.

    NOTE(review): indentation was lost in the paste; this assumes the
    month flip happens once per month, after the day loop -- confirm.
    """
    # click_to_start_date()
    # sdate = start_date()
    months = 12
    while months > 0:
        months -= 1
        for day in range(31, 0, -1):
            click_calendar()
            click_prev_day(day)
            click_export()
        click_calendar()
        click_prev_month()


if __name__ == '__main__':
    main()

# Few areas that still need improvement: click_prev_month() -- had a little
# difficulty…  (original notes truncated)
# Pull roughly the last three months of daily GOOG quotes from Yahoo!
# Finance and plot the adjusted close.  Meant to run in an IPython/Jupyter
# notebook, where the following magic enables inline plots:
# %matplotlib inline
#
# NOTE(review): pandas.io.data was deprecated and later removed from
# pandas; current installs need the separate pandas-datareader package
# instead -- confirm against the pandas version in use.
from pandas.io.data import DataReader
from datetime import date
from dateutil.relativedelta import relativedelta

goog = DataReader('GOOG', 'yahoo', date.today() + relativedelta(months=-3))
goog.tail()
goog.plot(y='Adj Close')
#! py35
# Read an input text file, strip everything up to and including the first
# space on each line, and append the remainder to a CSV output file.
# The Windows paths lost their backslashes in the paste and are restored
# below as raw strings -- verify against the original environment.

IN_PATH = r'C:\projects\txt.txt'
OUT_PATH = r'C:\projects\text.csv'


def strip_first_field(line):
    """Return *line* with its first space-delimited token and the space removed.

    Assumes the line contains at least one space; a line without one raises
    IndexError, matching the original `split(' ', 1)[1]` behaviour.
    """
    return line.split(' ', 1)[1]


def convert(in_path=IN_PATH, out_path=OUT_PATH):
    """Transform every line of *in_path*, appending results to *out_path*.

    Fixes vs. the original: the output file is opened once (the original
    re-opened it in append mode for every single line) and the input is
    streamed instead of slurped with readlines().
    """
    with open(in_path, 'r', encoding='utf-8') as fin, open(out_path, 'a') as fout:
        for line in fin:
            fout.write(strip_first_field(line))


if __name__ == '__main__':
    convert()
# John Gruber's "liberal, accurate regex pattern for matching web URLs",
# restored from the paste: every backslash escape (\b, \s, \[, \( ...) was
# stripped by the blog's formatting, the re.sub() call appeared *before*
# the pattern it uses was defined, and the Python-2 ur'' prefix is
# rewritten as r'' (str literals are already unicode on Python 3).
import re

# Generic + country-code TLDs accepted with or without an explicit scheme.
# (The stray "Ja" entry appears in the pasted list and is kept as-is to
# preserve behaviour -- TODO confirm against Gruber's published pattern.)
_TLDS = (
    'com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|'
    'museum|name|post|pro|tel|travel|xxx|'
    'ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|'
    'bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|'
    'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|'
    'et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|'
    'gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|'
    'jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|'
    'ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|'
    'nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|'
    'pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|'
    'so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|'
    'tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw'
)

p = re.compile(
    r'(?i)\b('
    r'(?:'
    r'https?:(?:/{1,3}|[a-z0-9%])'            # scheme (or scheme-ish prefix)
    r'|'
    r'[a-z0-9.\-]+[.](?:' + _TLDS + r')/'     # bare domain.tld/ without scheme
    r')'
    r'(?:'                                    # one or more path/query chunks:
    r'[^\s()<>{}\[\]]+'
    r'|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)'    # balanced (parens(nested))
    r'|\([^\s]+?\)'
    r')+'
    r'(?:'                                    # URL must end with:
    r'\([^\s()]*?\([^\s()]+\)[^\s()]*?\)'
    r'|\([^\s]+?\)'
    r'|[^\s`!()\[\]{};:\'".,<>?\u00ab\u00bb\u201c\u201d\u2018\u2019]'
    r')'
    r'|'
    r'(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.]'  # schemeless: not an email
    r'(?:' + _TLDS + r')\b/?(?!@))'
    r')'
)

test_str = u''
subst = u''

# Replace every URL found in test_str with subst.
result = p.sub(subst, test_str)

# Even longer Regex for finding URL
# LinkedIn's Industry Codes: load the industry-code CSV into a SQLite
# table and print it back.
#
# Fixes vs. the pasted original: statements are re-ordered so the table
# exists before it is queried (the paste SELECTed from linkedin_industries
# before CREATE TABLE ran); "groups" is quoted because it is a reserved
# word in newer SQLite; the builtin name `file` is no longer shadowed;
# the connection is closed; the Windows path lost its backslashes in the
# paste and is restored as a best guess.
import csv
import sqlite3

DB_PATH = 'industry_classifications'
CSV_PATH = r'C:\projects\Linkedin\Industry Codes.csv'  # TODO confirm path


def load_linkedin_industries(db_path=DB_PATH, csv_path=CSV_PATH):
    """Create the linkedin_industries table, bulk-load *csv_path* into it,
    commit, and return every stored row.

    Args:
        db_path: SQLite database file (or ':memory:').
        csv_path: comma-delimited file of (code, groups, description) rows.

    Returns:
        List of (code, groups, description) tuples as stored by SQLite.
    """
    conn = sqlite3.connect(db_path)
    try:
        curs = conn.cursor()
        curs.execute(
            'create table if not exists linkedin_industries '
            '(code int(3), "groups" char(60), description char(60))')
        with open(csv_path, 'r', encoding='utf-8') as f:
            for row in csv.reader(f, delimiter=','):
                # Parameterized insert, as in the original.
                curs.execute(
                    'INSERT INTO linkedin_industries (code, "groups", description) '
                    'VALUES (?, ?, ?)', row)
        conn.commit()
        curs.execute('select * from linkedin_industries')
        return curs.fetchall()
    finally:
        conn.close()


if __name__ == '__main__':
    for rec in load_linkedin_industries():
        print(rec)