-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathGenerateImageDataset.py
More file actions
110 lines (90 loc) · 4.33 KB
/
GenerateImageDataset.py
File metadata and controls
110 lines (90 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 16:04:53 2019
@author: ZhangXin
"""
import os
import torch
import json
import pandas as pd
from Statistics import histAndBoxPlot
def featureListToImageTensor(featureListJsonFilePath, lenOfSourceDict=17372, lenOfSinkDict=7784,
                             width=4, height=300, maxHeight=2000):
    """Lay the node ids of a feature list out as a fixed-width image tensor.

    The JSON file is read as a list of communities, each community being a
    list of clusters, and each cluster a list of numeric node ids. Every node
    id is turned into one pixel via ``getNormalizedPixel`` and written row by
    row onto a ``maxHeight x width`` canvas of doubles using these rules:

    * each community starts on a fresh row;
    * a cluster that does not fit into the remainder of a partially filled
      row starts on a fresh row;
    * pixels fill a row left to right and wrap to the next row when full.

    Args:
        featureListJsonFilePath: Path of the ``featureList.json`` file.
        lenOfSourceDict: Divisor forwarded to ``getNormalizedPixel`` for
            positive node ids.
        lenOfSinkDict: Divisor forwarded to ``getNormalizedPixel`` for
            non-positive node ids.
        width: Number of pixels per row.
        height: Number of leading rows returned to the caller.
        maxHeight: Number of rows allocated for the working canvas.

    Returns:
        A tuple ``(imageTensor, rawHeight)``. ``imageTensor`` is the slice
        ``canvas[:height]`` (a view of the full canvas, not a copy).
        ``rawHeight`` is the number of rows the complete layout occupies; it
        is not capped by ``height`` or ``maxHeight``.
    """
    with open(featureListJsonFilePath, "r") as featureListJsonFile:
        featureList = json.load(featureListJsonFile)

    imageTensor = torch.zeros(maxHeight, width).double()
    widthIndex, heightIndex = 0, 0
    for community in featureList:
        # A new community never shares a row with the previous one.
        if widthIndex != 0:
            widthIndex = 0
            heightIndex += 1
        for cluster in community:
            # Keep a cluster on one row when possible: break the row early if
            # the cluster would not fit into what is left of it.
            if widthIndex != 0 and widthIndex + len(cluster) > width:
                widthIndex = 0
                heightIndex += 1
            for nodeId in cluster:
                # Rows beyond the canvas are only counted, not stored;
                # writing them would raise IndexError and abort the whole
                # run although they can never be part of the returned image.
                if heightIndex < maxHeight:
                    imageTensor[heightIndex, widthIndex] = getNormalizedPixel(
                        nodeId, lenOfSourceDict, lenOfSinkDict)
                widthIndex += 1
                if widthIndex >= width:
                    widthIndex = 0
                    heightIndex += 1

    # A partially filled last row still counts as a row.
    rawHeight = heightIndex + 1 if widthIndex != 0 else heightIndex
    return imageTensor[:height], rawHeight
def getNormalizedPixel(nodeId, lenOfSourceDict, lenOfSinkDict):
    """Scale a node id by the size of the dictionary it is divided against.

    Positive ids are divided by ``lenOfSourceDict``; zero and negative ids
    are divided by ``lenOfSinkDict``, so the sign of the id is preserved in
    the result.
    """
    dictSize = lenOfSourceDict if nodeId > 0 else lenOfSinkDict
    return nodeId / dictSize
def generateImageDataset(apkDecompileDatesetDirPath, imageDatasetDirPath,
                         lenOfSourceDict=17372, lenOfSinkDict=7784, isMalware=True):
    """Convert every decompiled APK's feature list into a saved image tensor.

    For each sub-directory of ``apkDecompileDatesetDirPath`` that contains a
    ``featureList.json``, the feature list is rendered with
    ``featureListToImageTensor`` (width 4, height 300, maxHeight 2000) and the
    tensor is written with ``torch.save`` to ``imageDatasetDirPath`` as
    ``<subdir>_0.pickle`` when ``isMalware`` is true and ``<subdir>_1.pickle``
    otherwise. Afterwards the raw image heights are summarised with
    ``DataFrame.describe()``, written to a CSV under ``DataStatistics/``,
    passed to ``histAndBoxPlot`` and printed.

    Args:
        apkDecompileDatesetDirPath: Directory holding one sub-directory per
            decompiled APK.
        imageDatasetDirPath: Output directory for the tensor files; created
            if missing.
        lenOfSourceDict: Forwarded to ``featureListToImageTensor``.
        lenOfSinkDict: Forwarded to ``featureListToImageTensor``.
        isMalware: Selects the file-name suffix and the statistics file/plot
            label (malware vs. benign).
    """
    imageRawHeightList = []
    os.makedirs(imageDatasetDirPath, exist_ok=True)
    labelSuffix = "_0.pickle" if isMalware else "_1.pickle"

    for apkDecompileDir in os.listdir(apkDecompileDatesetDirPath):
        featureListJsonFilePath = os.path.join(apkDecompileDatesetDirPath,
                                               apkDecompileDir, "featureList.json")
        if not os.path.isfile(featureListJsonFilePath):
            continue
        # Forward the caller's dictionary sizes instead of re-stating the
        # defaults, so non-default arguments actually take effect.
        imageTensor, imageRawHeight = featureListToImageTensor(
            featureListJsonFilePath,
            lenOfSourceDict=lenOfSourceDict, lenOfSinkDict=lenOfSinkDict,
            width=4, height=300, maxHeight=2000)
        imageRawHeightList.append(imageRawHeight)
        imageTensorFilePath = os.path.join(imageDatasetDirPath,
                                           apkDecompileDir + labelSuffix)
        with open(imageTensorFilePath, "wb") as imageTensorFile:
            # torch.save serialises the whole underlying storage of a view;
            # clone so only the returned rows end up in the file.
            torch.save(imageTensor.clone(), imageTensorFile)

    if not imageRawHeightList:
        # describe() raises on a frame without columns; nothing to summarise.
        print("No featureList.json found under %s; skipping statistics."
              % apkDecompileDatesetDirPath)
        return

    statisticsDataFrame = pd.DataFrame(imageRawHeightList,
                                       columns=["imageRawHeight"]).describe()
    if isMalware:
        statisticsCSVFilePath = "DataStatistics/malwareImageRawHeightStatistics.csv"
        plotLabel = "malwareImageRawHeight"
    else:
        statisticsCSVFilePath = "DataStatistics/benignImageRawHeightStatistics.csv"
        plotLabel = "benignImageRawHeight"
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(statisticsCSVFilePath), exist_ok=True)
    statisticsDataFrame.to_csv(statisticsCSVFilePath)
    histAndBoxPlot(imageRawHeightList, plotLabel)
    print(statisticsDataFrame)
if __name__ == "__main__":
    # Linux dataset locations. On Windows, pass paths such as
    # "F:\\test\\decompileDataset\\malware" and "F:\\test\\decompileDataset\\image".
    # Benign samples are processed first, then malware; both write into the
    # same image directory.
    datasetRuns = (
        ("/home/zhangxin/MyDroid/Dataset/2012/benign", False),
        ("/home/zhangxin/MyDroid/Dataset/2012/malware", True),
    )
    for decompileDirPath, malwareFlag in datasetRuns:
        generateImageDataset(decompileDirPath,
                             "/home/zhangxin/MyDroid/Dataset/2012/image",
                             isMalware=malwareFlag)