Hduoj用户AC题数统计爬虫
趁着前几天出去比赛的空余时间瞎写完了
一向写python都是现写现查库怎么用
写这个主要是为了班里统计方便 ,但现在应该用不上了2333
写的挺垃圾,也是第一次搞这个,因为库太好用了感觉自己写的也没啥水平哈哈
其中主要点是判断指定日期的做题AC数
用了下用户实时的提交页面
http://acm.hdu.edu.cn/status.php?first=&pid=&user=aaa&lang=0&status=0
1.爬下直到指定日期最后所有做题情况
2.判断时间是否符合,顺便学习了下datetime
- 因为提交记录是按时间从新到旧排列的,所以晚于终止日期的记录直接跳过,直到遇到早于起始日期的记录时跳出循环
- 需要注意的是:如果这个用户在指定日期及之前都没有做过题,会造成死循环,所以要判断当前是否已经是最后一页;因为 GET 参数里没有页数,所以判断 first 是否重复就可以了
3.判断是否ac
4.丢到dict
分享一下渣渣代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# encoding=utf-8
"""Collect per-user accepted-problem statistics from HDU OJ into an Excel file.

For every user ID listed in ``./acmid.xlsx`` the script scrapes:

* the total number of solved problems from the user's profile page, and
* the distinct problems accepted between two dates (inclusive) from the
  user's realtime status pages,

then writes one row per user to ``Acm<start>to<end>.xls``.

The functions read the module-level names ``start``, ``end``, ``d1`` and
``d2``, which are assigned in the ``__main__`` section at the bottom.
"""
import requests
import re
import xlrd
import xlwt
from bs4 import BeautifulSoup
import datetime

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input

USER_STATUS_URL = "http://acm.hdu.edu.cn/userstatus.php"
STATUS_URL = "http://acm.hdu.edu.cn/status.php"

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Compiled once instead of once per user.
_NAME_RE = re.compile(r'>(.*)</h1>')
_SOLVED_RE = re.compile(r'Solved</td><td align=center>([0-9]+)<')

# Meaning of the first four cells of a status-table row, in page order.
_STATUS_FIELDS = ('Runid', 'Subtime', 'Status', 'Id')


def save_mysql():
    """Placeholder for a database backend; intentionally does nothing."""
    pass


def _write_column(sheet, col, values):
    """Write ``values`` down column ``col``, starting below the header row."""
    for row, value in enumerate(values, 1):
        sheet.write(row, col, value)


def save_excel(users, Allsolved, pid, sum):
    """Write the collected statistics to ``Acm<start>to<end>.xls``.

    Args:
        users: user IDs, one per output row.
        Allsolved: total solved count per user.
        pid: per-user list of problem IDs accepted in the date range.
        sum: per-user count of problems accepted in the date range.

    Reads the module-level ``start`` and ``end`` strings for the column
    heading and the output file name.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet1 = book.add_sheet('sheet1', cell_overwrite_ok=True)
    heads = ['ID', u'指定日期累计AC题数' + '(' + start + 'to' + end + ')', u'总共已AC题数', u'指定日期AC题号']
    print(u'\n准备将数据存入表格...')

    # Header row.
    for col, head in enumerate(heads):
        sheet1.write(0, col, head)

    id_col = 0
    range_solved_col = 1
    solved_col = 2
    range_ids_col = 3
    sheet1.col(range_solved_col).width = 256 * 20

    # Each column is written independently, so lists of different lengths
    # are still written as far as they go.
    _write_column(sheet1, id_col, users)
    _write_column(sheet1, solved_col, Allsolved)
    # NOTE(review): each entry of pid is a list of strings; xlwt appears to
    # render a list as rich-text segments in one cell - confirm.
    _write_column(sheet1, range_ids_col, pid)
    _write_column(sheet1, range_solved_col, sum)

    book.save('Acm' + start + 'to' + end + '.xls')
    print(u'\n录入成功!')


def _fetch_profile(user):
    """Return ``(display_name, total_solved)`` scraped from the profile page.

    Falls back to the user ID and ``'0'`` when the page does not contain the
    expected markup (for example an unknown user) instead of crashing on a
    failed regex match.
    """
    page = requests.get(USER_STATUS_URL, params=[('user', user)],
                        timeout=REQUEST_TIMEOUT)
    name = _NAME_RE.search(page.text)
    solved = _SOLVED_RE.search(page.text)
    display_name = name.group(1) if name else user
    total_solved = solved.group(1) if solved else '0'
    return display_name, total_solved


def _parse_status_rows(html):
    """Return the submissions on one status page as a list of dicts.

    Each dict has the keys ``Runid``, ``Subtime``, ``Status`` and ``Id``,
    taken from the first four cells of a table row.
    """
    soup = BeautifulSoup(html, 'lxml')
    # The submission list is taken from the fourth <table> on the page.
    status_table = soup.findAll('table')[3]
    rows = []
    for tr in status_table.findAll('tr')[1:]:  # first row is the header
        cells = [td.getText() for td in tr.findAll('td')[:4]]
        if len(cells) < len(_STATUS_FIELDS):
            # Incomplete row: skip it rather than inherit stale values.
            continue
        rows.append(dict(zip(_STATUS_FIELDS, cells)))
    return rows


def _collect_accepted(user, date_from, date_to):
    """Collect the distinct problems ``user`` got accepted in a date range.

    Pages through the status list via the ``first`` run-ID parameter until a
    submission older than ``date_from`` is seen or a page comes back empty.

    Returns:
        ``(repid, count)`` where ``repid`` is ``['|', id, ' ', id, ' ', ...]``
        (the problem IDs with separators, as written to the sheet) and
        ``count`` is the number of distinct accepted problems.
    """
    repid = ['|']
    seen_ids = set()  # O(1) duplicate check
    count = 0
    first = '99999999'
    reached_start = False
    while not reached_start:
        req = requests.get(
            STATUS_URL,
            params=[('first', first), ('pid', ''), ('user', user),
                    ('lang', '0'), ('status', '0')],
            timeout=REQUEST_TIMEOUT)
        rows = _parse_status_rows(req.text)
        if not rows:
            # No (more) submissions. Also covers a user with no submissions
            # at all, which previously left the run ID unbound.
            break

        accepted = []
        for row in rows:
            # Only the calendar day matters; drop the time of day.
            day = datetime.datetime.strptime(row['Subtime'].split()[0],
                                             '%Y-%m-%d')
            if date_from <= day <= date_to:
                if row['Status'] == 'Accepted' and row['Id'] not in seen_ids:
                    seen_ids.add(row['Id'])
                    repid.append(row['Id'])
                    repid.append(' ')
                    accepted.append(row)
            elif day < date_from:
                # Older than the range: finish this page, then stop paging.
                reached_start = True

        for row in accepted:
            print(row)
        count += len(accepted)
        # Next page starts just below the last run ID on this page.
        first = str(int(rows[-1]['Runid']) - 1)
    return repid, count


def run(users):
    """Scrape statistics for every user in ``users`` and save them to Excel.

    Reads the module-level ``d1`` and ``d2`` datetimes as the inclusive date
    range.
    """
    all_solved = []
    range_ids = []
    range_counts = []
    for user in users:
        # Total number of problems solved so far.
        display_name, total_solved = _fetch_profile(user)
        print(display_name + " " + total_solved)
        all_solved.append(total_solved)

        # Problems accepted within the requested date range.
        print(user)
        repid, count = _collect_accepted(user, d1, d2)
        print('------')
        print(count)
        print('------')
        range_ids.append(repid)
        range_counts.append(count)
    save_excel(users, all_solved, range_ids, range_counts)


def main():
    """Read user IDs from the first column of ``./acmid.xlsx`` and run."""
    book = xlrd.open_workbook('./acmid.xlsx')
    sheet = book.sheet_by_name('sheet1')
    users = sheet.col_values(0)
    print(users)
    run(users)


if __name__ == '__main__':
    print('日期输入格式:年-月-日')
    start = _read_line("起始日期:")
    end = _read_line("终止日期:")
    d1 = datetime.datetime.strptime(start, '%Y-%m-%d')
    d2 = datetime.datetime.strptime(end, '%Y-%m-%d')
    main()
近期评论