# scope_example.py
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request
# Create the option container that the crawler will be configured with.
options = Options()

# Scope rules: which parts of a discovered URL must equal those of the
# start point for the page to be crawled.
options.scope.protocol_must_match = False  # e.g. http and https both allowed
options.scope.subdomain_must_match = True
options.scope.hostname_must_match = True
options.scope.tld_must_match = True

# Maximum search depth; None means unlimited.
options.scope.max_depth = None

# Only requests using one of these methods are crawled.
options.scope.request_methods = [
    Request.METHOD_GET,
    Request.METHOD_POST,
    Request.METHOD_PUT,
    Request.METHOD_DELETE,
    Request.METHOD_OPTIONS,
    Request.METHOD_HEAD,
]

# Build the crawler from the options and hand it the start point.
crawler = Crawler(options)
start_request = Request("https://finnwea.com/")
crawler.start_with(start_request)
If True, only crawl pages that use the same protocol as the start point (e.g. only https). Default is False.
options.scope.protocol_must_match = False
If True, only crawl pages on the same subdomain as the start point. If the start point is not on a subdomain, no subdomains will be crawled. Default is True.
Please note that the www subdomain is treated the same as no subdomain.
options.scope.subdomain_must_match = True
If True, only crawl pages with the same hostname as the start point (e.g. only finnwea). Default is True.
Please note that if you set this to False, chances are that the crawler will never stop crawling.
options.scope.hostname_must_match = True
If True, only crawl pages with the same TLD as the start point (e.g. only .com). Default is True.
options.scope.tld_must_match = True
The maximum search depth. Default is None (unlimited).
options.scope.max_depth = None
Only crawl requests that use these request methods. If the list is empty or None,
all request methods will be crawled. Default is all request methods.
# Request methods that are crawled.
options.scope.request_methods = [
    Request.METHOD_GET,
    Request.METHOD_POST,
    Request.METHOD_PUT,
    Request.METHOD_DELETE,
    Request.METHOD_OPTIONS,
    Request.METHOD_HEAD,
]