-
Notifications
You must be signed in to change notification settings - Fork 0
/
package_cache.py
executable file
·160 lines (119 loc) · 4.98 KB
/
package_cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
# 1st-party
import calendar
from datetime import datetime
import json
import logging
import os
import time
import urllib.request as request
import xmlrpc.client as xmlrpclib
# 3rd-party
# sudo apt-get install python3-bs4
from bs4 import BeautifulSoup
def get_last_timestamp_before_compromise(timestamps, compromise_timestamp):
    """Return the latest timestamp strictly earlier than the compromise.

    timestamps: iterable of mutually comparable timestamps, in any order.
    compromise_timestamp: the moment of compromise; a timestamp equal to it
        does not count as "before".

    Returns the greatest timestamp that is < compromise_timestamp, or None if
    no timestamp precedes the compromise (including when timestamps is empty).
    """
    # Scan from newest to oldest: the first timestamp found to precede the
    # compromise is necessarily the latest one that does.
    for candidate in sorted(timestamps, reverse=True):
        if candidate < compromise_timestamp:
            return candidate

    # Every timestamp was at or after the compromise (or there were none).
    return None
# Simple class that caches, for every PyPI project, the timestamps at which
# the project changed its packages. It is initialized from a previously
# written JSON file (if one exists) and stores its results back in that file.
class pypi_database_builder:
    """Build and persist a cache mapping project names to package timestamps.

    The cache is a dict of the form
    {"<project>": [<POSIX timestamp>, ...]} with each list sorted in
    increasing order, stored as JSON in `filename`.
    """

    def __init__(self, filename, rebuild_cache=False):
        """Load the cache from `filename` if it exists, else start empty.

        filename: path of the JSON cache file to read now and write in dump().
        rebuild_cache: when True, build() re-fetches every project even if it
            is already present in the loaded cache.
        """
        # For every project, we get list of timestamps (sorted in increasing
        # order) that the project added/updated/removed some package.
        # {
        #   "Django": [timestamp("May 17 2010"), ..., timestamp("Oct 22 2014")]
        # }
        if os.path.exists(filename):
            with open(filename, 'rt') as fp:
                self.project_to_package_timestamps = json.load(fp)
        else:
            self.project_to_package_timestamps = {}

        self.filename = filename
        self.rebuild_cache = rebuild_cache
        # Despite the name, this is a count: build() checkpoints the cache and
        # pauses after every `throttle_time` fetched projects.
        self.throttle_time = 100

    def get_timestamp(self, package_date):
        """Convert a date such as "May 23, 2014" to a POSIX timestamp.

        package_date is the date that a project last added, updated or
        removed a package. It is interpreted as midnight UTC of that day.
        Raises ValueError if the string does not match '%b %d, %Y'.
        """
        # Parse the date as a time.struct_time tuple.
        package_timestruct = time.strptime(package_date, '%b %d, %Y')
        # Turn the time.struct_time tuple into a POSIX timestamp (UTC).
        return calendar.timegm(package_timestruct)

    def get_timestamps(self, soup):
        """Extract the sorted, de-duplicated timestamps from a project page.

        soup: parsed project page (a BeautifulSoup-like object). Two layouts
            are understood: a single '#all-versions' element, or otherwise a
            single 'div.metadata' whose 'Versions' <dt> is followed by a list.
            In both, dates are read from 'span.text-muted' elements.

        Returns a list of POSIX timestamps in increasing order.
        Raises AssertionError when the page matches neither layout.
        """
        # Layout failures are raised explicitly (rather than via `assert`) so
        # that they still fire under `python -O`; the exception type is kept
        # as AssertionError so existing callers see no difference.
        all_versions = soup.select('#all-versions')
        if len(all_versions) > 0:
            if len(all_versions) != 1:
                raise AssertionError(
                    'expected exactly one #all-versions element, found '
                    '{}'.format(len(all_versions)))
            spans = all_versions[0].select('span.text-muted')
        else:
            metadata_divs = soup.select('div.metadata')
            if len(metadata_divs) != 1:
                raise AssertionError(
                    'expected exactly one div.metadata element, found '
                    '{}'.format(len(metadata_divs)))

            # Find the <dt> that labels the list of versions.
            versions = None
            for metadata_term in metadata_divs[0].find_all('dt'):
                if metadata_term.string == 'Versions':
                    versions = metadata_term
                    break
            if versions is None:
                # Previously this surfaced as an accidental UnboundLocalError.
                raise AssertionError('no "Versions" entry in div.metadata')

            # Two hops: presumably the first sibling is the whitespace text
            # node between <dt> and its <dd> -- TODO confirm against a page.
            version_list = versions.next_sibling.next_sibling.ul
            spans = version_list.select('span.text-muted')

        # Several packages may share a date; keep each date only once.
        dates = {span.string for span in spans}
        return sorted(self.get_timestamp(date) for date in dates)

    def build(self):
        """Fetch timestamps for every listed project and persist the cache.

        Projects already in the cache are skipped unless rebuild_cache is set.
        A project whose page cannot be fetched or parsed is logged and counted
        as a failure. The cache is dumped (followed by a 5 second pause) after
        every `throttle_time` fetched projects, and once more at the end.

        Raises AssertionError if 1% or more of the attempted projects failed;
        in that case the final dump is not performed.
        """
        projects = xmlrpclib.ServerProxy('https://pypi.python.org/pypi')\
                            .list_packages()
        failure_counter = 0
        success_counter = 0
        # Cached projects are counted separately so that progress reporting
        # and the failure rate stay meaningful when rebuild_cache is False.
        skipped_counter = 0

        for project in projects:
            if (not self.rebuild_cache
                    and project in self.project_to_package_timestamps):
                skipped_counter += 1
                continue

            try:
                url = 'https://warehouse.python.org/project/{}/'.format(project)
                # Close the HTTP response as soon as the page is parsed.
                # NOTE(review): no parser is named, so bs4 picks whichever is
                # installed -- consider pinning one.
                with request.urlopen(url) as response:
                    soup = BeautifulSoup(response)
                timestamps = self.get_timestamps(soup)
            except Exception:
                # Deliberately broad: one bad project must not stop the crawl.
                # KeyboardInterrupt/SystemExit are not caught, so the run can
                # still be interrupted.
                logging.exception('Missed project: {}'.format(project))
                failure_counter += 1
            else:
                self.project_to_package_timestamps[project] = timestamps
                logging.info('Found project: {}'.format(project))
                success_counter += 1

            # Checkpoint and throttle after every `throttle_time` fetches.
            counter = failure_counter + success_counter
            if counter % self.throttle_time == 0:
                processed = counter + skipped_counter
                progress_rate = (processed / len(projects)) * 100
                logging.debug('Sleeping for 5 seconds... ({}% complete)'\
                              .format(progress_rate))
                self.dump()
                time.sleep(5)

        counter = failure_counter + success_counter
        # Nothing attempted (empty listing or fully cached) means no failures.
        if counter:
            failure_percentage = (failure_counter / counter) * 100
        else:
            failure_percentage = 0.0
        failure_message = 'Missed {} ({}%) projects'.format(failure_counter,
                                                            failure_percentage)
        if failure_percentage >= 1:
            raise AssertionError(failure_message)
        logging.info(failure_message)
        self.dump()

    def dump(self):
        """Write the cache to `self.filename` as sorted, indented JSON."""
        with open(self.filename, 'wt') as fp:
            json.dump(self.project_to_package_timestamps, fp, sort_keys=True,
                      indent=4, separators=(',', ': '))
if __name__ == '__main__':
    # Script entry point: rebuild the whole package cache from scratch,
    # logging progress to a file.

    # rw for owner and group but not others: the mask strips every permission
    # bit for "others" from files created below (the log and the cache).
    os.umask(0o07)

    # The format below labels every timestamp as UTC, but logging renders
    # %(asctime)s in local time by default. Make the label truthful.
    logging.Formatter.converter = time.gmtime

    logging.basicConfig(filename='/var/experiments-output/package_cache.log',
                        level=logging.DEBUG, filemode='w',
                        format='[%(asctime)s UTC] [%(name)s] [%(levelname)s] '
                               '[%(funcName)s:%(lineno)s@%(filename)s] '
                               '%(message)s')

    # rebuild_cache=True: re-fetch every project, even those already cached.
    cache = pypi_database_builder('/var/experiments-output/package_cache.json',
                                  rebuild_cache=True)
    cache.build()