import urllib
from urllib.parse import urlparse, parse_qs
import sys
import re
import os
from random import shuffle
from bs4 import BeautifulSoup
from apicall import APICall, APICallEncoder, APIWriter
from harParser import HarParser
from browser import Browser


class APIFinder:
    def __init__(self, url=None, harDirectory=None, searchString=None, removeParams=False, count=1, cookies=None):
        self.url = url
        self.harDirectory = harDirectory
        self.searchString = searchString
        self.removeParams = removeParams
        self.count = count
        self.browser = None
        self.cookies = cookies

    def start(self):
        if self.count > 1 and self.url is None:
            print("Cannot provide page count with no URL given")
            sys.exit(1)
        if self.removeParams and self.url is None:
            print("WARNING: Must have Internet connection to remove unneeded parameters")

        if self.url:
            # Scan a live site for all APIs
            os.makedirs(self.harDirectory, exist_ok=True)
            self.deleteExistingHars()
            self.browser = Browser("chromedriver/chromedriver", "browsermob-proxy-2.1.4/bin/browsermob-proxy", self.harDirectory, cookies=self.cookies)
            if self.searchString is not None:
                print("Searching URL " + self.url + " for string " + self.searchString)
            # Move recursively through the site
            apiCalls = self.crawlingScan(self.url)
        else:
            # Scan an existing directory of har files
            print("Parsing existing directory of har files")
            harParser = HarParser(self.harDirectory, self.searchString, self.removeParams)
            apiCalls = harParser.parseMultipleHars()

        if self.browser is not None:
            self.browser.close()
        return apiCalls

    def openURL(self, url):
        return self.browser.get(url)  # load the url in Chrome

    def getDomain(self, url):
        # Strip a leading "www." prefix; str.lstrip('www.') would strip any leading
        # characters in {'w', '.'} and mangle domains such as "web.example.com"
        netloc = urlparse(url).netloc
        return netloc[len("www."):] if netloc.startswith("www.") else netloc

    def isInternal(self, url, baseUrl):
        # Returns an absolute internal URL, or None if the link points off-site
        if url.startswith("/"):
            return baseUrl + url
        if self.getDomain(baseUrl) == self.getDomain(url):
            return url
        return None
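
    # Illustration (hypothetical values, not taken from the original source):
    #   isInternal("/about", "http://example.com")               -> "http://example.com/about"
    #   isInternal("http://example.com/a", "http://example.com") -> "http://example.com/a"
    #   isInternal("http://other.org/a", "http://example.com")   -> None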

    def findInternalURLsInText(self, text, currentUrl, allFoundURLs):
        # Regex-based variant of findInternalURLs; as written it only prints the
        # URLs it matches and does not collect or return them
        newUrls = []
        regex = re.compile(r'(https?://[\w]+\.)(com|org|biz|net)((/[\w]+)+)(\.[a-z]{2,4})?(\?[\w]+=[\w]+)?((&[\w]+=[\w]+)+)?', re.ASCII)
        matches = re.finditer(regex, text)
        for match in matches:
            print(str(match.group()))

    # Returns the updated list of all URLs found so far, plus the new internal
    # URLs discovered on this page (relative URLs or URLs on the current domain)
    def findInternalURLs(self, bsObj, currentUrl, allFoundURLs):
        newUrls = []
        baseUrl = urlparse(currentUrl).scheme + "://" + urlparse(currentUrl).netloc
        # Check every link on the page
        for link in bsObj.findAll("a"):
            if 'href' in link.attrs:
                # baseUrl, urlInPage = parseUrl(link.attrs)
                url = link.attrs['href']
                # It's an internal URL and we haven't found it already
                url = self.isInternal(url, baseUrl)
                if url is not None and url not in newUrls and url not in allFoundURLs:
                    newUrls.append(url)
                    allFoundURLs.append(url)
        return allFoundURLs, newUrls

    def getContentType(self, headers):
        for header in headers:
            if header["name"] == "Content-Type":
                return header["value"]
        return None
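
    # Illustration (hypothetical HAR-style header list, not from the original source):
    #   getContentType([{"name": "Content-Type", "value": "application/json"}])
    #   -> "application/json"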

    # Get rid of all the current har files
    def deleteExistingHars(self):
        files = os.listdir(self.harDirectory)
        for singleFile in files:
            # Match only .har files rather than any filename containing "har"
            if singleFile.endswith(".har"):
                os.remove(os.path.join(self.harDirectory, singleFile))

    # Performs a recursive crawl of a site, searching for APIs
    def crawlingScan(self, url, apiCalls=None, allFoundURLs=None):
        # Use None defaults to avoid Python's shared mutable default-argument pitfall
        if apiCalls is None:
            apiCalls = []
        if allFoundURLs is None:
            allFoundURLs = []
        self.count = self.count - 1
        if self.count < 0:
            return apiCalls
        harParser = HarParser(self.harDirectory, searchString=self.searchString, removeParams=self.removeParams)
        # If uncommented, will return as soon as a matching call is found
        # if self.searchString is not None and len(apiCalls) > 0:
        #     return apiCalls
        try:
            print("Scanning URL: " + url)
            html = self.openURL(url)
            if html is not None:
                bsObj = BeautifulSoup(html, "lxml")
                harObj = harParser.getSingleHarFile()
                apiCalls = harParser.scanHarfile(harObj, apiCalls=apiCalls)
                allFoundURLs, newUrls = self.findInternalURLs(bsObj, url, allFoundURLs)
                shuffle(newUrls)
                for newUrl in newUrls:
                    self.crawlingScan(newUrl, apiCalls, allFoundURLs)
        except (KeyboardInterrupt, SystemExit):
            print("Stopping crawl")
            self.browser.close()
            apiWriter = APIWriter(apiCalls)
            apiWriter.outputAPIs()
            sys.exit(1)
        return apiCalls
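

# A minimal usage sketch (not part of the original file): the URL, HAR directory,
# and page count below are illustrative assumptions; APIWriter is used the same
# way as in crawlingScan's interrupt handler above.
if __name__ == "__main__":
    finder = APIFinder(url="http://example.com", harDirectory="hars", count=5)
    apiCalls = finder.start()
    apiWriter = APIWriter(apiCalls)
    apiWriter.outputAPIs()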