-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlog-main.py
158 lines (143 loc) · 5.86 KB
/
log-main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import codecs
import os
import urllib
import gzip
import argparse
import gzip
import GeoIP
from jinja2 import Environment, FileSystemLoader
# Directory containing this script; the templates/ folder is resolved relative
# to it so the parser can be started from any working directory.
PROJECT_ROOT = os.path.dirname(__file__)

# Jinja2 environment used to render templates/report.html.
env = Environment(
    loader=FileSystemLoader(os.path.join(PROJECT_ROOT, "templates")),
    trim_blocks=True)

parser = argparse.ArgumentParser(description='Apache2 log parser.')
parser.add_argument('--output',
    help="This is where we place the output files such as report.html and map.svg",
    default='build')
parser.add_argument('--path', help='Path to Apache2 log files', default="/home/malyhass/log-parser")
parser.add_argument('--top-urls', help="Find top URL-s", action='store_true')
parser.add_argument('--geoip', help ="Resolve IP-s to country codes", default="/home/malyhass/GeoIP.dat")
parser.add_argument('--verbose', help="Increase verbosity", action="store_true")
args = parser.parse_args()

# Open the GeoIP database only after the arguments are parsed so that --geoip
# is actually honoured and --help works without a database present.  The
# script used to always open "GeoIP.dat" from the working directory, so that
# location is kept as a fallback when the --geoip path does not exist.
geoip_path = args.geoip if os.path.exists(args.geoip) else "GeoIP.dat"
gi = GeoIP.open(geoip_path, GeoIP.GEOIP_MEMORY_CACHE)

# User-agent substrings to count; the first one found in an agent string wins.
keywords = "Windows", "Linux", "OS X", "Ubuntu", "Googlebot", "bingbot", "Android", "YandexBot", "facebookexternalhit"

d = {}             # user-agent keyword -> number of requests
urls = {}          # unquoted request path -> number of requests
total = 0          # number of log lines read, including unparsable ones
files = []         # (filename, modification time, size) tuples of listed files
users = {}         # not used anywhere in the visible script
countries = {}     # country code (None when unresolved) -> number of requests
ip_addresses = {}  # source address (addresses containing ":" are skipped) -> number of requests
user_bytes = {}    # username taken from "/~username/..." paths -> bytes served

# Make sure the directory for report.html and map.svg exists.
if not os.path.exists(args.output):
    os.makedirs(args.output)
for filename in os.listdir(args.path):
if not filename.startswith("access.log"):
print "Skipping unknown file:", filename
continue
if filename.endswith(".gz"):
continue
fh = gzip.open(os.path.join(args.path, filename))
else:
fh = open(os.path.join(args.path, filename))
if args.verbose:
print "Parsing:", filename
for line in fh:
total = total + 1
try:
source_timestamp, request, response, referrer, _, agent, _ = line.split("\"")
method, path, protocol = request.split(" ")
except ValueError:
continue # Skip garbage
source_ip , _, _, timestamp = source_timestamp.split(" ", 3)
if not ":" in source_ip:
ip_addresses[source_ip] = ip_addresses.get(source_ip, 0) + 1
cc = gi.country_code_by_addr(source_ip)
countries[cc] = countries.get(cc, 0) + 1
if path == "*": continue # Skip asterisk for path
_, status_code, content_length, _ = response.split(" ")
content_length = int(content_length)
path = urllib.unquote(path)
if path.startswith("/~"):
username = path[2:].split("/")[0]
try:
user_bytes[username] = user_bytes[username] + content_length
except:
user_bytes[username] = content_length
try:
urls[path] = urls[path] + 1
except:
urls[path] = 1
for keyword in keywords:
if keyword in agent:
try:
d[keyword] = d[keyword] + 1
except KeyError:
d[keyword] = 1
break
if not urls:
print("No log files!")
exit(255)
from datetime import datetime
def humanize(bytes):
    """Format a byte count as a short human-readable string.

    Values below 1024 are shown as whole bytes ("512 B"); larger values are
    scaled by powers of 1024 and shown with one decimal as KB, MB or GB.
    GB is the largest unit, so anything from 1024 ** 3 upwards is given in GB.

    The parameter name shadows the ``bytes`` builtin; it is left unchanged so
    existing callers keep working.
    """
    if bytes < 1024:
        return "%d B" % bytes
    # Try KB, then MB: the first unit whose upper bound exceeds the value wins.
    for exponent, unit in ((1, "KB"), (2, "MB")):
        if bytes < 1024 ** (exponent + 1):
            return "%.1f %s" % (bytes / 1024.0 ** exponent, unit)
    return "%.1f GB" % (bytes / 1024.0 ** 3)
from lxml import etree
from lxml.cssselect import CSSSelector
#document = etree.parse(open('templates/map.svg'))
document = etree.parse(open(os.path.join(PROJECT_ROOT, 'templates', 'map.svg')))
max_hits = max(countries.values())
print("country with max amount of hits:", max_hits)
for country_code, hits in countries.items():
if not country_code: continue
print country_code, hex(hits * 255 / max_hits) [2:]
sel = CSSSelector("#" + country_code.lower())
for j in sel(document):
j.set("style", "fill:#" + hex(hits * 255 / max_hits)[2:] + "0000")
for i in j.iterfind("{http://www.w3.org/2000/svg}path"):
i.attrib.pop("class", "")
# Values handed to templates/report.html: the formatter plus both rankings,
# sorted with the largest count first.
context = {
    "humanize": humanize,
    "url_hits": sorted(urls.items(), key=lambda i: i[1], reverse=True),
    "user_bytes": sorted(user_bytes.items(), key=lambda item: item[1], reverse=True),
}

report_path = os.path.join(args.output, "report.html")
with codecs.open(report_path, "w", encoding="utf-8") as fh:
    fh.write(env.get_template("report.html").render(context))

# Write the map before launching the browser so that both output files exist
# by the time the report is displayed.
with open(os.path.join(args.output, "map.svg"), "w") as fh:
    fh.write(etree.tostring(document))

# Open the report that was just written.  The path used to be hard-coded to
# "build/report.html", which pointed at the wrong file whenever --output was
# given.  The trailing "&" keeps the script from blocking on the browser.
os.system("x-www-browser file://" + os.path.realpath(report_path) + " &")
for filename in os.listdir("."):
mode, inode, device, nlink, vid, gid, size, atime, mtime, ctime = os.stat(filename)
files.append((filename, datetime.fromtimestamp(mtime), size))
files.sort(key = lambda(filename, dt, size):dt)
print "Newest file is:", files[-1][0]
print "Oldesr file is:", files[0][0]
for filename, dt, size in files:
print filename, dt, humanize(size)
print"************************"
print("Top IP-addresses:")
results = ip_addresses.items()
results.sort(key = lambda item:item[1], reverse=True)
for ip, hits in results[:5]:
print ip, "==>", hits, "(", hits * 100 / total, "%)"
print "************************"
print "Total lines =", total
print "***********************"
result = urls.items()
result.sort(key = lambda item:item[1], reverse=True)
for keyword, hits in result[:5]:
print keyword, "==>", hits, "(", hits * 100 / total, "%)"
print "************************"