解决Hdoj3337问题的简易爬虫

这是好久前遇到的一道非主流题,当时愣是没弄明白题意。最近闲着没事翻出来看了看,并在网上找到了某大牛写的爬虫,写得真美,顿时觉得有必要收藏一下。虽然现在还不能完全看懂,但是我想在不久的将来,当我想系统地学习Python的时候,这肯定是很有用的东西。
Hdoj3337

题目非常短,主要是这句话:

There is only one line in the input. It is a sentence which implies some integer. The length of the sentence is no more than 16. The sentence contains only letters(both uppercase and lowercase), digits, and whitespace. You may assume the given sentence is written in standard English, and always implies exactly one single integer.

意思就是给你一句话,让你输出这句话“imply”(暗示)的那个整数……然而你并不知道这里的imply具体指什么,所以必须先知道它给的这句话到底是什么。因此解决这道题的标准做法,就是利用OJ返回的 AC、WA、TLE、MLE 等评测结果对输入数据进行挖掘:用类似二分查找的方法,通过多次提交来确定输入中每一个位置上的字符是什么。

以下是大牛写的程序(Python,依赖第三方库 requests):
import requests
import time
import sys
# Endpoints of the HDU online judge used by this script.
homepage_url = 'http://acm.hdu.edu.cn/'
login_url = 'http://acm.hdu.edu.cn/userloginex.php'
submit_url = 'http://acm.hdu.edu.cn/submit.php'
status_url = 'http://acm.hdu.edu.cn/status.php'

# Language name -> 1-based code; test_submit() subtracts 1 before posting it.
language = {
    name: code
    for code, name in enumerate(
        ('G++', 'GCC', 'C++', 'C', 'Pascal', 'Java', 'C#'), start=1)
}

# Bytes recovered so far (filled in by find_input_data()).
file_data = []
# Run id of the last submission whose verdict has already been consumed.
previous_runid = ''

# Judge verdicts that the submitted probe programs provoke on purpose, and the
# comparison outcome each of them stands for.
equal = 'Wrong Answer'
less = 'Output Limit Exceeded'
greater = 'Time Limit Exceeded'

# Placeholder credentials; the script entry point overwrites them from argv.
username = 'your username'
userpass = 'your password'
# C++ program submitted to probe one byte of the judge's hidden input.
# test_data_submit() prepends the definitions of `guess` and `skip`. The
# program compares the input byte at offset `skip` with `guess` and reports
# the outcome through the judge verdict it provokes:
#   byte == guess -> prints a wrong answer and exits   (Wrong Answer)
#   byte <  guess -> writes output forever             (Output Limit Exceeded)
#   byte >  guess -> spins forever                     (Time Limit Exceeded)
# The backslash in the C string literal is doubled so that Python emits the
# two characters backslash + n instead of a real line break, which would
# leave the C++ string literal unterminated and fail to compile.
template_input_data = '''
#include <cstdio>
#include <cstdlib>
char buffer[0x10000];//64KB
// Wrong Answer
void equal(void)
{
printf("www.hoxily.com\\n");
exit(0);
}
// OLE
void less(void)
{
while (true)
{
fwrite(buffer, sizeof(char), sizeof(buffer), stdout);
}
}
// TLE
void greater(void)
{
while (true)
{
}
}

int main(void)
{
int ch;
int count = 0;
while ((ch = getchar()) != EOF)
{
if (ch < 0)
{
ch = ch + 256;
}
if (count == skip)
{
if (ch == guess)
{
equal();
}
else if (ch < guess)
{
less();
}
else
{
greater();
}
}
count++;
}
return 0;
}
'''
# C++ program submitted to probe the length of the judge's hidden input.
# test_length_submit() prepends the definition of `guess`. The program counts
# every byte on stdin and reports how that count compares with `guess`
# through the judge verdict it provokes:
#   count == guess -> prints a wrong answer and exits  (Wrong Answer)
#   count <  guess -> writes output forever            (Output Limit Exceeded)
#   count >  guess -> spins forever                    (Time Limit Exceeded)
# The backslash in the C string literal is doubled so that Python emits the
# two characters backslash + n instead of a real line break, which would
# leave the C++ string literal unterminated and fail to compile.
template_input_length = '''
#include <cstdio>
#include <cstdlib>
#define MAXLEN 0x7fffffff
#define MINLEN 0
char buffer[0x10000];//64KB
// Wrong Answer
void equal(void)
{
printf("www.hoxily.com\\n");
exit(0);
}
// OLE
void less(void)
{
while (true)
{
fwrite(buffer, sizeof(char), sizeof(buffer), stdout);
}
}
// TLE
void greater(void)
{
while (true)
{
}
}

int main(void)
{
int ch;
int count = 0;
while ((ch = getchar()) != EOF)
{
count++;
}
if (count < guess)
{
less();
}
else if (count == guess)
{
equal();
}
else
{
greater();
}
return 0;
}
'''
# Cookie jar sent with every request; login() and test_submit() add the
# PHPSESSID entry once the server hands one out.
cookies = dict([
    ('exesubmitlang', '0'),
    ('CNZZDATA1254072405', '965477099-1421749361-|1427515514'),
])
# Default problem id; the script entry point replaces it with argv[1].
pid = 1000
def login(referer):
    """Log in to the judge and remember the session cookie.

    Posts the module-level ``username`` / ``userpass`` to ``login_url``. When
    the response carries a ``PHPSESSID`` cookie it is stored in the
    module-level ``cookies`` dict so that later requests reuse the session.

    Args:
        referer: URL sent as the ``referer`` header of the login request.
    """
    headers = {
        'referer': referer,
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
    }
    r = requests.post(
        login_url,
        params={'action': 'login'},
        data={'username': username, 'userpass': userpass, 'login': 'Sign+In'},
        headers=headers,
        cookies=cookies,
    )
    session_id = r.cookies.get('PHPSESSID')
    if session_id is not None:
        # Mutating the shared dict is enough; no rebinding of the global.
        cookies['PHPSESSID'] = session_id
def test_length_submit(guess):
    """Submit the length probe for ``guess`` and return its outcome.

    Args:
        guess: Candidate input length baked into the submitted program.

    Returns:
        Whatever ``test_submit`` returns for the generated program
        (``'lt'``, ``'gt'`` or ``'eq'``).
    """
    # The definition must end with a real line break: the template starts
    # with a preprocessor directive, and any stray text after the semicolon
    # (such as a literal "rn") would break compilation.
    usercode = 'int guess = ' + str(guess) + ';\r\n' + template_input_length
    return test_submit(usercode)
def find_input_length():
    """Determine the byte length of the judge's hidden input.

    First doubles ``guess`` until a probe reports ``'lt'`` (the input is
    shorter than ``guess``), then binary-searches between the last guess that
    was not too large and that upper bound. Every probe is one real
    submission, and progress is printed as it goes.

    Returns:
        The length for which a probe reported ``'eq'``. If the binary search
        runs out without an ``'eq'`` verdict, the last guess tried is returned
        unchanged (no error is raised).
    """
    print('GUESS', 'RESULT')
    guess = 1
    left = 0
    while True:
        result = test_length_submit(guess)
        print(guess, result)
        if result == 'eq':
            # Exact hit while doubling: no need to spend more submissions.
            print('length =', guess)
            return guess
        if result == 'lt':
            break
        left = guess
        guess = guess * 2
    print('LEFT', 'RIGHT', 'GUESS', 'RESULT')
    right = guess
    while left <= right:
        guess = (left + right) // 2
        result = test_length_submit(guess)
        print(left, right, guess, result)
        if result == 'eq':
            print('length =', guess)
            break
        elif result == 'gt':
            # Input is longer than guess.
            left = guess + 1
        elif result == 'lt':
            # Input is shorter than guess.
            right = guess - 1
    return guess

def test_submit(usercode):
    """Submit ``usercode`` for problem ``pid`` and translate the verdict.

    Opens the submit page (logging in first when the server answers with a
    302 redirect), posts the code as C++, then polls the status page once a
    second until the newest run shows a final verdict.

    Args:
        usercode: Complete source text to submit.

    Returns:
        ``'lt'``, ``'gt'`` or ``'eq'``, depending on whether the status text
        contains the module-level ``less``, ``greater`` or ``equal`` verdict.

    Raises:
        Exception: The status text matches none of the known states.
        ValueError: Propagated from ``str.index`` when the status page lacks
            one of the markers searched for after the first table row.
    """
    global previous_runid
    headers = {
        'referer': 'http://acm.hdu.edu.cn/showproblem.php?pid=' + str(pid),
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
    }
    r = requests.get(submit_url, params={'pid': pid}, allow_redirects=False,
                     cookies=cookies, headers=headers)
    session_id = r.cookies.get('PHPSESSID')
    if session_id is not None:
        cookies['PHPSESSID'] = session_id
    if r.status_code == 302:
        # A redirect away from the submit page is treated as "need login".
        login(r.url)
    r = requests.post(
        submit_url,
        params={'action': 'submit'},
        data={'check': 0, 'problemid': pid,
              'language': language['C++'] - 1, 'usercode': usercode},
        allow_redirects=False, cookies=cookies, headers=headers)
    session_id = r.cookies.get('PHPSESSID')
    if session_id is not None:
        cookies['PHPSESSID'] = session_id
    # NOTE: the status code of the submit response is deliberately not
    # checked; a failed submission is detected below via the run id.
    while True:
        time.sleep(1)
        r = requests.get(status_url,
                         params={'user': username, 'lang': 0, 'status': 0},
                         headers=headers, cookies=cookies)
        text = r.text
        try:
            index = text.index('tr align=center')
        except ValueError:
            # Server is busy and returned something else; back off and retry.
            time.sleep(5)
            continue
        # The run id is the text between the next 'px>' and '</td>'.
        index = text.index('px>', index) + len('px>')
        end_index = text.index('</td>', index)
        runid = text[index:end_index]
        if runid == previous_runid:
            # Newest run is still the previous one: the submit failed, redo it.
            return test_submit(usercode)
        # The status is the text between the next 'green>' and '</font>'.
        index = text.index('green>', index) + len('green>')
        end_index = text.index('</font>', index)
        status = text[index:end_index]
        if any(state in status for state in ('Queuing', 'Compiling', 'Running')):
            continue
        # Checked in the same order as before: less, greater, equal.
        for verdict, outcome in ((less, 'lt'), (greater, 'gt'), (equal, 'eq')):
            if verdict in status:
                previous_runid = runid
                return outcome
        raise Exception('unknow judge status: ' + status)
def test_data_submit(skip, guess):
    """Submit the byte probe for offset ``skip`` and value ``guess``.

    Args:
        skip: Zero-based offset of the input byte to examine.
        guess: Candidate byte value to compare that byte against.

    Returns:
        Whatever ``test_submit`` returns for the generated program
        (``'lt'``, ``'gt'`` or ``'eq'``).
    """
    # Each definition must end with a real line break: the template starts
    # with a preprocessor directive, and stray text after the semicolon
    # (such as a literal "rn") would break compilation.
    usercode = 'int guess = ' + str(guess) + ';\r\n'
    usercode = usercode + 'int skip = ' + str(skip) + ';\r\n'
    usercode = usercode + template_input_data
    return test_submit(usercode)
def find_input_data():
    """Recover the judge's hidden input one byte at a time.

    Determines the input length with ``find_input_length``, then
    binary-searches the value (0..255) of the byte at every offset. Each probe
    is one real submission, and progress is printed as it goes.

    Returns:
        The module-level ``file_data`` list, extended with one entry per
        offset: the byte value found, or the string ``'UNKNOWN'`` when the
        search range was exhausted without an ``'eq'`` verdict.
    """
    length = find_input_length()
    print('LEFT', 'RIGHT', 'SKIP', 'GUESS', 'RESULT')
    for skip in range(length):
        left = 0
        right = 255
        while left <= right:
            guess = (left + right) // 2
            result = test_data_submit(skip, guess)
            print(left, right, skip, guess, result)
            if result == 'eq':
                file_data.append(guess)
                print(repr(file_data))
                break
            elif result == 'lt':
                # Byte is smaller than guess.
                right = guess - 1
            elif result == 'gt':
                # Byte is larger than guess.
                left = guess + 1
        else:
            # Runs only when the loop ended without break, i.e. left > right.
            file_data.append('UNKNOWN')
    return file_data
# Script entry: expects <pid> <username> <userpass> on the command line.
# Anything short of three arguments prints the usage text, rather than
# failing later with an IndexError on sys.argv[2] or sys.argv[3].
if len(sys.argv) < 4:
    print('usage:\ndiginput.py <pid> <username> <userpass>')
    sys.exit(1)
else:
    pid = int(sys.argv[1])
    username = sys.argv[2]
    userpass = sys.argv[3]
    find_input_data()

代码的思路也很清楚:先用二分查找确定输入的长度,再对输入中的每一个字符用二分查找“爬”出它的值。

执行一下:
myths@myths-X450LD:~/Desktop$ python hdu.py 3337 <用户名> <密码>
等上十几分钟,程序大约自动提交了100多次,得到如下输出:
('GUESS', 'RESULT')
(1, 'gt')
(2, 'gt')
(4, 'gt')
(8, 'gt')
(16, 'lt')
('LEFT', 'RIGHT', 'GUESS', 'RESULT')
(8, 16, 12, 'gt')
(13, 16, 14, 'eq')
('length =', 14)
('LEFT', 'RIGHT', 'SKIP', 'GUESS', 'RESULT')
(0, 255, 0, 127, 'lt')
(0, 126, 0, 63, 'gt')
(64, 126, 0, 95, 'lt')
(64, 94, 0, 79, 'gt')
(80, 94, 0, 87, 'lt')
(80, 86, 0, 83, 'gt')
(84, 86, 0, 85, 'lt')
(84, 84, 0, 84, 'eq')
[84]
(0, 255, 1, 127, 'lt')
(0, 126, 1, 63, 'gt')
(64, 126, 1, 95, 'gt')
(96, 126, 1, 111, 'lt')
(96, 110, 1, 103, 'gt')
(104, 110, 1, 107, 'lt')
(104, 106, 1, 105, 'lt')
(104, 104, 1, 104, 'eq')
[84, 104]
(0, 255, 2, 127, 'lt')
(0, 126, 2, 63, 'gt')
(64, 126, 2, 95, 'gt')
(96, 126, 2, 111, 'lt')
(96, 110, 2, 103, 'lt')
(96, 102, 2, 99, 'gt')
(100, 102, 2, 101, 'eq')
[84, 104, 101]
(0, 255, 3, 127, 'lt')
(0, 126, 3, 63, 'lt')
(0, 62, 3, 31, 'gt')
(32, 62, 3, 47, 'lt')
(32, 46, 3, 39, 'lt')
(32, 38, 3, 35, 'lt')
(32, 34, 3, 33, 'lt')
(32, 32, 3, 32, 'eq')
[84, 104, 101, 32]
(0, 255, 4, 127, 'lt')
(0, 126, 4, 63, 'gt')
(64, 126, 4, 95, 'lt')
(64, 94, 4, 79, 'lt')
(64, 78, 4, 71, 'lt')
(64, 70, 4, 67, 'lt')
(64, 66, 4, 65, 'eq')
[84, 104, 101, 32, 65]
(0, 255, 5, 127, 'lt')
(0, 126, 5, 63, 'gt')
(64, 126, 5, 95, 'lt')
(64, 94, 5, 79, 'gt')
(80, 94, 5, 87, 'lt')
(80, 86, 5, 83, 'eq')
[84, 104, 101, 32, 65, 83]
(0, 255, 6, 127, 'lt')
(0, 126, 6, 63, 'gt')
(64, 126, 6, 95, 'lt')
(64, 94, 6, 79, 'lt')
(64, 78, 6, 71, 'lt')
(64, 70, 6, 67, 'eq')
[84, 104, 101, 32, 65, 83, 67]
(0, 255, 7, 127, 'lt')
(0, 126, 7, 63, 'gt')
(64, 126, 7, 95, 'lt')
(64, 94, 7, 79, 'lt')
(64, 78, 7, 71, 'gt')
(72, 78, 7, 75, 'lt')
(72, 74, 7, 73, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73]
(0, 255, 8, 127, 'lt')
(0, 126, 8, 63, 'gt')
(64, 126, 8, 95, 'lt')
(64, 94, 8, 79, 'lt')
(64, 78, 8, 71, 'gt')
(72, 78, 8, 75, 'lt')
(72, 74, 8, 73, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73]
(0, 255, 9, 127, 'lt')
(0, 126, 9, 63, 'lt')
(0, 62, 9, 31, 'gt')
(32, 62, 9, 47, 'lt')
(32, 46, 9, 39, 'lt')
(32, 38, 9, 35, 'lt')
(32, 34, 9, 33, 'lt')
(32, 32, 9, 32, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32]
(0, 255, 10, 127, 'lt')
(0, 126, 10, 63, 'gt')
(64, 126, 10, 95, 'gt')
(96, 126, 10, 111, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111]
(0, 255, 11, 127, 'lt')
(0, 126, 11, 63, 'gt')
(64, 126, 11, 95, 'gt')
(96, 126, 11, 111, 'lt')
(96, 110, 11, 103, 'lt')
(96, 102, 11, 99, 'gt')
(100, 102, 11, 101, 'gt')
(102, 102, 11, 102, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102]
(0, 255, 12, 127, 'lt')
(0, 126, 12, 63, 'lt')
(0, 62, 12, 31, 'gt')
(32, 62, 12, 47, 'lt')
(32, 46, 12, 39, 'lt')
(32, 38, 12, 35, 'lt')
(32, 34, 12, 33, 'lt')
(32, 32, 12, 32, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102, 32]
(0, 255, 13, 127, 'lt')
(0, 126, 13, 63, 'gt')
(64, 126, 13, 95, 'lt')
(64, 94, 13, 79, 'lt')
(64, 78, 13, 71, 'gt')
(72, 78, 13, 75, 'lt')
(72, 74, 13, 73, 'lt')
(72, 72, 13, 72, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102, 32, 72]

最后那串数字按ASCII码转换成字符,就是“The ASCII of H”;也就是说,这道题要输出的整数就是字母 H 的ASCII码 72。