""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")
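# A value such as RequestRate(requests=3, seconds=20) means "at most 3
# requests every 20 seconds" (illustrative numbers, not from any real file).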


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """
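    # Typical usage, as a minimal sketch (the host, paths and agent name
    # below are purely illustrative):
    #
    #     rp = RobotFileParser("https://www.example.com/robots.txt")
    #     rp.read()
    #     if rp.can_fetch("MyCrawler", "https://www.example.com/some/page"):
    #         ...  # fetch the page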

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                # fetching robots.txt itself is forbidden: treat every URL
                # as disallowed
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                # robots.txt is missing (or another client error): treat
                # every URL as allowed
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by
        one or more blank lines.
        """
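        # For reference, a typical record in a robots.txt file looks like the
        # following (an illustrative example, not taken from any real site):
        #
        #     User-agent: *
        #     Disallow: /private/
        #     Allow: /private/public.html
        #     Crawl-delay: 10
        #     Request-rate: 3/20
        #     Sitemap: https://www.example.com/sitemap.xml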
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before converting to int, make sure the value is a
                        # valid integer; otherwise int() would raise ValueError
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    #  Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
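        # Reduce the URL to its path (plus params, query and fragment) so it
        # is relative to the site root, then re-quote it to match the quoted
        # rule paths stored in RuleLine.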
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
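        # Rule paths are matched as plain string prefixes; the special path
        # "*" applies to every URL.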
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
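        # e.g. a hypothetical "MyBot/2.1" becomes "mybot" at this point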
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
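

# A minimal end-to-end sketch (not part of the module; the robots.txt lines
# and agent names below are made up for illustration):
#
#     lines = [
#         "User-agent: FigTree",
#         "Crawl-delay: 3",
#         "Disallow: /tmp",
#         "",
#         "User-agent: *",
#         "Disallow: /cgi-bin",
#     ]
#     rp = RobotFileParser()
#     rp.parse(lines)
#     rp.can_fetch("FigTree/1.0", "/tmp/x")    # -> False (first entry applies)
#     rp.can_fetch("OtherBot", "/cgi-bin/q")   # -> False (catch-all entry)
#     rp.crawl_delay("FigTree")                # -> 3
#     rp.crawl_delay("OtherBot")               # -> None ('*' entry has no delay)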