-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
303 lines (235 loc) · 16.6 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import os
from pathlib import Path
import re
import shutil
# ####################################################################################################
# Field lengths (in bytes) for the binary `_parent` file layout, derived by
# observation of the on-disk structure. Each record in the file is a fixed
# sequence of these fields; the parser below reads them in order.
filenameDefinitionLength = 0x02
unknownDefinition1Length = 0x02
null2Length = 0x02
unknownDefinition2Length = 0x04
null1Length = 0x01
continueReadingLength = 0x01
numberOfDirectoryNamesLength = 0x01
null3Length = 0x03
unknownDefinition3Length = 0x02
unknownDefinition4Length = 0x04
fileNameLengthLength = 0x01
unknownDefinition5Length = 0x02
unknownDefinition6Length = 0x02
null4Length = 0x04
fileTypeLength = 0x01
# a single zero byte: comparing a read byte against this converts it to a bool
# (b'\x00' -> False, anything else -> True)
byteCompareForBool = b'\x00'
# when reading the file type, these are the types i've observed
fileTypeMappingFile = b'\x05'
fileTypeMappingDir = b'\x04'
# ####################################################################################################
# Paths and patterns used while walking the filesystem.
# sample object paths kept for reference/testing
testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
# fuse mounted osd path
testRoot = '/mnt/test/5.7f_head/all/#5:fea38466:::100028f0fe1.00000000:head#'
fuseRoot = '/mnt/test'
# destination dirs
mntDir = '/mnt'
destRoot = 'ceph-fs-storage'
# exclusions, mostly to exclude the metadata dir.
# NOTE: this is deliberately a single string (substring-matched against each
# walked path), not a tuple — the previous parentheses were misleading.
exclusionDirs = '/mnt/test/meta'
# for selecting the folder and grabbing relative files
relativeFolderStructureDir = '/attr'
folderStructureFile = '_parent'
dataFilename = 'data'
# placement group indicator extractor (raw string so `\.` is a literal dot):
#   g1 = pg prefix, g2 = hash, g3 = file unique id,
#   g4 = chunk index (hex), g5 = snapshot indicator
placementGroupNameRegex = r'(#.*?):+(.*?):+(.*?)\.(.*?):+(.*?)#$'
# ####################################################################################################
# Walk the fuse-mounted OSD, parse each object's binary `_parent` attribute to
# recover its original path, and copy/reassemble the object's data under destRoot.
fuseRootDirs = os.walk(fuseRoot)
# walk through the fuse-mounted OSD
for fullPaths, _, _ in fuseRootDirs:
    # dont walk into excluded dirs (substring match against the walked path)
    if exclusionDirs not in fullPaths:
        # only walk into dirs which are attr dirs
        if fullPaths.endswith(relativeFolderStructureDir):
            # at this point we've got the dirs we want
            # now we can walk up 1 dir for the (assumed) placement group's main dir
            pgMainDir = os.path.normpath(os.path.dirname(fullPaths))
            # join up the main dir with the folder structure file
            pgFolderStructureFile = os.path.normpath(os.path.join(pgMainDir, relativeFolderStructureDir[1:], folderStructureFile))
            # extract the file's unique reference from the placement-group dir name
            pgDataFileRegexSearch = re.search(placementGroupNameRegex, Path(pgFolderStructureFile).parent.parent.name)
            # only proceed if a match has been made. this means that there _is_ a data file to grab
            if pgDataFileRegexSearch:
                pgDataFileUniqueIndicator = pgDataFileRegexSearch.group(3)
                # chunk index is encoded as hex in the object name
                pgDataFileChunk = int(pgDataFileRegexSearch.group(4), 16)
                # only proceed if the folder structure file exists
                if os.path.exists(pgFolderStructureFile):
                    # join up the main dir with the data file
                    pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))
                    # only proceed if the data file exists
                    if os.path.exists(pgDataFile):
                        # ####################################################################################################
                        # scrape the file/folder structure out of the binary _parent file
                        # empty list for saving the path components (collected deepest-first)
                        filePathInformation = []
                        # open the file readonly as a binary file
                        with open(pgFolderStructureFile, mode='rb') as file:
                            # get EOF seek address - seek to 0 bytes from the end of file (2)
                            file.seek(0, 2)
                            eofAddress = file.tell()
                            # return to beginning of file - seek to 0 bytes from the beginning of file (0)
                            file.seek(0, 0)
                            # _parent file header
                            startDefinition = file.read(filenameDefinitionLength)
                            # todo: ensure that this value is 0x05 0x04
                            # unknown block 1
                            unknownDefinition1 = file.read(unknownDefinition1Length)
                            # null with length 2
                            file.read(null2Length)
                            # unknown block 2
                            unknownDefinition2 = file.read(unknownDefinition2Length)
                            # null with length 1
                            file.read(null1Length)
                            # continue reading byte (b'\x00' = stop, anything else = continue)
                            continueReading = file.read(continueReadingLength) != byteCompareForBool
                            # todo: it should always be 0x01 here as we are in the header
                            # null with length 2
                            file.read(null2Length)
                            # number of directory names in path
                            numberOfDirectoryNames = file.read(numberOfDirectoryNamesLength)
                            # null with length 3
                            file.read(null3Length)
                            # read path-component records until EOF or the continue flag clears
                            while(file.tell() < eofAddress and continueReading):
                                # per-record file header
                                startFileDefinition = file.read(filenameDefinitionLength)
                                # todo: ensure that this value is 0x02 0x02
                                # unknown block 3
                                unknownDefinition3 = file.read(unknownDefinition3Length)
                                # null with length 2
                                file.read(null2Length)
                                # unknown block 4
                                unknownDefinition4 = file.read(unknownDefinition4Length)
                                # null with length 1
                                file.read(null1Length)
                                # continue reading byte; this clears after the last file path record
                                continueReading = file.read(continueReadingLength) != byteCompareForBool
                                # null with length 2
                                file.read(null2Length)
                                # length of the filename (single byte)
                                fileNameLength = file.read(fileNameLengthLength)
                                # null with length 3
                                file.read(null3Length)
                                # the filename itself; read length comes from the length byte above.
                                # byteorder is irrelevant for one byte but stated explicitly so this
                                # also runs on Python < 3.11 (where int.from_bytes requires it)
                                fileName = file.read(int.from_bytes(fileNameLength, 'big')).decode('utf-8')
                                # append the captured file name to the path component list
                                if fileName:
                                    filePathInformation.append(fileName)
                                # unknown block 5
                                unknownDefinition5 = file.read(unknownDefinition5Length)
                                # unknown block 6
                                unknownDefinition6 = file.read(unknownDefinition6Length)
                                # null with length 4
                                file.read(null4Length)
                            # after the loop, 1 final field to check and that is the item type
                            fileType = file.read(fileTypeLength)
                        # ####################################################################################################
                        # build the destination path from the scraped components
                        # add root dirs
                        filePathInformation.append(destRoot)
                        filePathInformation.append(mntDir)
                        # reverse the list so that it runs root -> leaf for joining
                        filePathInformation.reverse()
                        # joins up all the dirs with the destination root dir. excludes filename
                        newDir = os.path.normpath(os.path.join(*filePathInformation[:-1]))
                        newFile = os.path.normpath(os.path.join(*filePathInformation))
                        # ####################################################################################################
                        # FILE RECOVERY
                        # if it is a directory, make that directory!
                        if fileType == fileTypeMappingDir:
                            if not os.path.exists(newFile):
                                print('new dir: ' + newFile)
                                os.makedirs(newFile)
                        # if it is a file, copy that file!
                        if fileType == fileTypeMappingFile:
                            if not os.path.exists(newFile):
                                if not os.path.exists(newDir):
                                    print('new dir: ' + newDir)
                                    os.makedirs(newDir)
                                print('old file: ' + pgDataFile)
                                print('new file: ' + newFile)
                                shutil.copyfile(pgDataFile, newFile)
                                # files over 4MB are ripped into 4MB pieces across placement groups,
                                # so a data file of exactly 4MB may be the first chunk of a larger file
                                if os.stat(pgDataFile).st_size == 4194304:
                                    listOfPgDataFileChunkPaths = []
                                    print('data file size is 4MB. checking if chunked...')
                                    # search for the file's unique identifier in other placement groups.
                                    # WARNING: this process is SLOW as the whole tree is re-scanned.
                                    # BUGFIX: a FRESH os.walk() is required here — re-using the outer
                                    # `fuseRootDirs` generator consumed its remaining entries and
                                    # silently ended the outer loop after the first file checked
                                    for pgDataFileUniqueIndicatorFullPaths, _, _ in os.walk(fuseRoot):
                                        # only collect dirs which contain the file's unique identifier
                                        if pgDataFileUniqueIndicator in pgDataFileUniqueIndicatorFullPaths:
                                            listOfPgDataFileChunkPaths.append(os.path.normpath(pgDataFileUniqueIndicatorFullPaths))
                                        # count the number of directories found. logic:
                                    # if the number of chunked files found is only 1, then it really _is_ a 4MB file
                                    if len(listOfPgDataFileChunkPaths) == 1:
                                        print('data file not chunked')
                                    # if there are more than 1 dir found, then append the data to the already-written file (above)
                                    if len(listOfPgDataFileChunkPaths) > 1:
                                        print('chunked data file found!')
                                        # iterate once per chunk path so every chunk index is visited
                                        for pgDataFileChunkPath in listOfPgDataFileChunkPaths:
                                            # advance to the next chunk index
                                            pgDataFileChunk += 1
                                            # render as the 8-digit zero-padded hex string used in dir names
                                            pgDataFileChunkString = (f'{pgDataFileChunk:0>8x}')
                                            # loop through the list (again) and filter by current chunk
                                            for pgDataFileChunkPath2 in listOfPgDataFileChunkPaths:
                                                if pgDataFileChunkString in pgDataFileChunkPath2:
                                                    # this gets the main dir of the data chunk file
                                                    pgDataChunkMainDir = os.path.normpath(os.path.dirname(pgDataFileChunkPath2))
                                                    # this gets the path of the data chunk file
                                                    pgDataChunkFile = os.path.normpath(os.path.join(pgDataChunkMainDir, dataFilename))
                                                    print(pgDataFileChunkPath2)
                                                    # read the file's chunk data file
                                                    with open(pgDataChunkFile, 'rb') as pgDataFileChunkDataFile_file:
                                                        # write to the recovered file in append binary mode
                                                        with open(newFile, 'ab') as newFile_file:
                                                            newFile_file.write(pgDataFileChunkDataFile_file.read())