Package Libs ::
Module librecognition
|
|
1 """
2 (c) Immunity, Inc. 2004-2007
3
4
5 U{Immunity Inc.<http://www.immunityinc.com>}
6
7
8 Library for function recognizing
9
10 """
11 __VERSION__ = '1.2'
12
13
14 from libanalyze import *
15 from libdatatype import *
16 from libstackanalyze import *
17 import binascii
18 import struct
19 import hashlib
20 import re
21 import string
22 import debugger
23 import csv
24 import os
25
28 if not isinstance(dictionaries, list):
29 dictionaries = [ dictionaries ]
30
31 self.iterators = []
32 self.fds = []
33 self.idx = 0
34 for d in dictionaries:
35 try:
36 fd = open(d, "rb")
37 except:
38 fd = open(d, "w+b")
39 self.iterators.append(csv.reader(fd))
40 self.fds.append(fd)
42 for i in range(0, self.idx+1):
43 self.fds[i].seek(0)
44 self.idx = 0
45 return self
46
48 while self.iterators:
49 self.iterators.pop()
50 for fd in self.fds:
51 fd.close()
52 del self.fds
53
55 try:
56 data = self.iterators[self.idx].next()
57 except StopIteration:
58 if len(self.iterators) > self.idx+1:
59 self.idx += 1
60 return self.next()
61 else:
62 raise StopIteration
63
64 data.append(self.fds[self.idx].name)
65 return data
66
68 - def __init__(self, imm, dictionaryfiles=None):
69 """
70 This class try to recognize a function using different methods
71 (address/signature/heuristic).
72
73 @type imm: Debbuger OBJECT
74 @param imm: Debbuger instance
75
76 @type dictionaryfiles: STRING|LIST
77 @param dictionaryfiles: Name, or list of names, of .dat files inside the Data folder, where're stored the function
78 patterns. Use an empty string to use all .dat files in Data folder.
79 """
80 self.imm = imm
81 self.heuristicReferencesCache = {}
82 self.heuristicCache = {}
83 self.resolvCache = {}
84
85 if not dictionaryfiles:
86 dictionaryfiles = []
87 for file in os.listdir("Data"):
88 if file[-4:] == ".dat":
89 dictionaryfiles.append(os.path.join("Data", file))
90 self.dictionaries = MultiCSVIterator(dictionaryfiles)
91
93 """
94 Look up into our dictionaries to find a function match.
95
96 @type address: DWORD
97 @param address: Address of the function to search
98
99 @type heuristic: INTEGER
100 @param heuristic: heuristic threasold to consider a real function match
101
102 @rtype: STRING
103 @return: a STRING with the function's real name or the given address if there's no match
104 """
105
106
107 if self.resolvCache.has_key(address):
108 return self.resolvCache[address]
109
110
111 exact = self.makeFunctionHashExact(address)
112 for data in self.dictionaries:
113 if exact == data[4]:
114 self.resolvCache[address] = data[0]
115 break
116
117
118 if not self.resolvCache.has_key(address):
119 ref = self.selectBasicBlock(address)
120 posThreshold = 0
121 posName = ""
122 for data in self.dictionaries:
123
124
125 if ref == data[1]:
126 perc = self.checkHeuristic(address, data[2], data[3])
127
128 if perc >= heuristic and perc > posThreshold:
129 posThreshold = perc
130 posName = data[0]
131 if posName:
132 self.resolvCache[address] = posName
133
134
135 if not self.resolvCache.has_key(address):
136 self.resolvCache[address] = "%08X" % address
137
138 return self.resolvCache[address]
139
141 """
142 Check a given address with a precomputed hash of a function.
143 Return a percentage of match (you can use a threasold to consider a real match)
144
145 @type address: DWORD
146 @param address: Address of the function to compare
147
148 @type reference: STRING
149 @param reference: base64 representation of the compressed information about the function
150
151 @type refFirstCall: STRING
152 @param refFirstCall: the same, but following the function pointed by the first call in the first BB.
153 (OPTIONAL)
154
155 @rtype: INTEGER
156 @return: heuristic threasold to consider a real function match
157 """
158
159
160
161
162 if self.heuristicCache.has_key(address):
163 cfg = self.heuristicCache[address]
164 else:
165 cfg = self.makeFunctionHashHeuristic(address)
166 self.heuristicCache[address] = cfg
167
168
169 sha1 = hashlib.sha1(reference+refFirstCall).digest()
170 if self.heuristicReferencesCache.has_key(sha1):
171 refcfg = self.heuristicReferencesCache[sha1]
172 else:
173
174
175 refcfg = []
176 refcfg.append([])
177 refcfg.append([])
178 data = binascii.a2b_base64(reference)
179 for o in range(0,len(data),12):
180 (start, left, right) = struct.unpack("LLL",data[o:o+12])
181 refcfg[0].append([ start, left, right ])
182 if refFirstCall:
183 data = binascii.a2b_base64(refFirstCall)
184 for o in range(0,len(data),12):
185 (start, left, right) = struct.unpack("LLL",data[o:o+12])
186 refcfg[1].append([ start, left, right ])
187 self.heuristicReferencesCache[sha1] = refcfg
188
189 perc1 = self.compareHeuristic(cfg[0][:], refcfg[0][:])
190 if cfg[1] or refcfg[1]:
191 perc2 = self.compareHeuristic(cfg[1][:], refcfg[1][:])
192
193 perc = (perc1 + perc2) / 2
194 else:
195 perc = perc1
196
197 return perc
198
200
201
202
203
204
205
206 diff = eq = 0
207 checked = []
208
209 for info in cfg:
210 bbeq = value = 0
211 for rinfo in refcfg:
212 tmp = 0
213 if info[0] == rinfo[0]: tmp += 1
214 if info[1] == rinfo[1]: tmp += 1
215 if info[2] == rinfo[2]: tmp += 1
216 if tmp > bbeq:
217 bbeq = tmp
218 value = rinfo
219 if bbeq == 3: break
220 try:
221 idx=refcfg.index(value)
222 refcfg.pop(idx)
223 except ValueError:
224 pass
225
226 eq += bbeq
227 diff += 3 - bbeq
228
229
230 for rinfo in refcfg:
231 bbeq = value = 0
232 for info in cfg:
233 tmp = 0
234 if info[0] == rinfo[0]: tmp += 1
235 if info[1] == rinfo[1]: tmp += 1
236 if info[2] == rinfo[2]: tmp += 1
237 if tmp > bbeq:
238 bbeq = tmp
239 value = rinfo
240 if bbeq == 3: break
241 try:
242 idx=cfg.index(value)
243 cfg.pop(idx)
244 except ValueError:
245 pass
246
247 eq += bbeq
248 diff += 3 - bbeq
249
250
251 return eq * 100 / (eq + diff)
252
254 """
255 Consider:
256 - Control Flow Graph
257 - generalized instructions that:
258 access memory/write memory/use registers/use constant/call/jmp/jmc
259 and all his combinations.
260 - special case of functions with just 1 BB and a couple of calls (follow the first call)
261
262 @type address: DWORD
263 @param address: address of the function to hash
264
265 @type compressed: Boolean
266 @param compressed: return a compressed base64 representation or the raw data
267
268 @type followCalls: Boolean
269 @param followCalls: follow the first call in a single basic block function
270
271 @rtype: LIST
272 @return: the first element is described below and the second is the result of this same function but over the first
273 call of a single basic block function (if applies), each element is like this:
274 a base64 representation of the compressed version of each bb hash:
275 [4 bytes BB(i) start][4 bytes BB(i) 1st edge][4 bytes BB(i) 2nd edge]
276 0 <= i < BB count
277 or the same but like a LIST with raw data.
278 """
279
280 f = self.imm.getFunction(address)
281 bbs = f.getBasicBlocks()
282 bbmap = {}
283 cfg = {}
284
285
286 for bb in bbs:
287 cfg[bb.getStart()] = bb.getEdges()
288
289
290 for bb in bbs:
291 bbhash_data = []
292 for op in bb.getInstructions(self.imm):
293
294 instr = []
295 instr.append(op.getMemType())
296 instr.append(op.indexed)
297 instr.append(op.getCmdType())
298 instr.append(op.optype[0])
299 instr.append(op.optype[1])
300 instr.append(op.optype[2])
301 instr.append(op.getSize())
302 bbhash_data.append(self.hash_a_list(instr))
303 bbhash = self.hash_a_list(bbhash_data)
304 bbmap[bb.getStart()] = bbhash
305
306
307 rcfg = []
308 for start,edges in cfg.iteritems():
309 rstart = 0
310 redges = [0, 0]
311 rstart = bbmap[start]
312 if bbmap.has_key(edges[0]):
313 redges[0] = bbmap[edges[0]]
314 if bbmap.has_key(edges[1]):
315 redges[1] = bbmap[edges[1]]
316 rcfg.append([ rstart,redges[0],redges[1] ])
317
318
319 firstcall = []
320 if followCalls and len(bbs) == 1 and len(bbs[0].getCalls()) > 0:
321
322
323 op = self.imm.disasm(bbs[0].getCalls()[0])
324 if op.getJmpConst():
325 firstcall = self.makeFunctionHashHeuristic(op.getJmpConst(), compressed, followCalls=False)[0]
326
327 del op
328
329 del bbs
330 del f
331 rcfg.sort()
332
333 if compressed:
334
335 fhash = ""
336 for data in rcfg:
337
338 fhash += struct.pack("LLL", data[0], data[1], data[2])
339 return [ binascii.b2a_base64(fhash)[:-1], firstcall ]
340 else:
341 return [ rcfg, firstcall ]
342
344 """
345 Take a list and return a binary representation of his CRC32.
346
347 @type data: LIST
348 @param data: a list of elements to make the hash
349
350 @rtype: UNSIGNED LONG
351 @return: a hash of the given values
352 """
353
354 ret = 0
355 for elem in data:
356 ret = binascii.crc32(str(elem), ret)
357 return struct.unpack("L", struct.pack("l",ret))[0]
358
360 """
361 Search memory to find a function that fullfit the options.
362
363 @type csvline: STRING
364 @param csvline: A line of a Data CSV file. This's a simple support for copy 'n paste from a CSV file.
365
366 @type heuristic: INTEGER
367 @param heuristic: heuristic threasold to consider a real function match
368
369 @type module: STRING
370 @param module: name of a module to restrict the search
371
372 @rtype: LIST
373 @return: a list of tuples with possible function's addresses and the heauristic match percentage
374 """
375
376 line = csv.reader([csvline]).next()
377 if len(line) < 9: line[7] = ""
378 return self._searchFunctionByHeuristic(line[1], line[2], line[3], line[4], heuristic, module, string.split(line[7],"|"))
379
380 - def _searchFunctionByHeuristic(self, search, functionhash=None, firstcallhash=None, exact=None, heuristic = 90, module = None, firstbb = None):
381 """
382 Search memory to find a function that fullfit the options.
383
384 @type search: STRING
385 @param search: searchCommand string to make the first selection
386
387 @type functionhash: STRING
388 @param functionhash: the primary function hash (use makeFunctionHash to generate this value)
389
390 @type firstcallhash: STRING
391 @param firstcallhash: the hash of the first call on single BB functions (use makeFunctionHash to generate this value)
392
393 @type exact: STRING
394 @param exact: an exact function hash, this's a binary byte-per-byte hash (use makeFunctionHash to generate this value)
395
396 @type heuristic: INTEGER
397 @param heuristic: heuristic threasold to consider a real function match
398
399 @type module: STRING
400 @param module: name of a module to restrict the search
401
402 @type firstbb: STRING
403 @param firstbb: generalized assembler of the first BB (to search function begin)
404
405 @rtype: LIST
406 @return: a list of tuples with possible function's addresses and the heauristic match percentage
407 """
408
409
410
411 if isinstance(search, list):
412 search.reverse()
413 tmp = search[:]
414 if tmp: search = tmp.pop()
415 if tmp: functionhash = tmp.pop()
416 if tmp: firstcallhash = tmp.pop()
417 if tmp: exact = tmp.pop()
418 if tmp: version = tmp.pop()
419 if tmp: file = tmp.pop()
420 if tmp: firstbb = tmp.pop()
421
422
423 if not search or not functionhash:
424 return None
425
426 if not firstcallhash:
427 firstcallhash = ""
428
429 heu_addy = None
430 heu_perc = 0
431 poss_functions = []
432 poss_return = []
433 search = string.replace(search, "\\n","\n")
434 if search:
435 if module:
436
437 for key,mod in debugger.get_all_modules().iteritems():
438 if module.lower() in key.lower():
439 poss_functions += self.imm.searchCommandsOnModule(mod[0], search)
440 else:
441 poss_functions = self.imm.searchCommands(search)
442 if poss_functions:
443 for poss in poss_functions:
444
445 addy = self.imm.getFunctionBegin(poss[0])
446 if not addy:
447
448 for mod in self.imm.getAllModules().values():
449 if mod.getMainentry():
450
451 f = StackFunction(self.imm, mod.getMainentry())
452 if f.isInsideFunction(poss[0]):
453 addy = mod.getMainentry()
454 break
455 if not addy and firstbb:
456
457 addy = self.findBasicBlockHeuristically(poss[0], firstbb)
458 if not addy and firstbb:
459 tmp = self.findFirstBB(poss[0])
460 if tmp:
461
462 addy = self.findBasicBlockHeuristically(tmp, firstbb)
463 if not addy:
464 addy = poss[0]
465
466
467
468 if exact:
469 test = self.makeFunctionHashExact(addy)
470 if exact == test and not firstcallhash:
471
472
473 return [ (addy, 100) ]
474
475 perc = self.checkHeuristic(addy, functionhash, firstcallhash)
476
477 if perc >= heuristic:
478 poss_return.append( (addy,perc) )
479
480 return poss_return
481
483 """
484 Look up into our dictionaries to find a function match.
485
486 @type name: STRING
487 @param name: Name of the function to search
488
489 @type module: STRING
490 @param module: name of a module to restrict the search
491
492 @type version: STRING
493 @param version: restrict the search to the given version
494
495 @type heuristic: INTEGER
496 @param heuristic: heuristic threasold to consider a real function match
497
498 @rtype: LIST
499 @return: a list of tuples with possible function's addresses and the heauristic match percentage
500 """
501
502 name = name.lower()
503
504
505 poss_return = []
506 for data in self.dictionaries:
507 if name == data[0].lower():
508
509 if version and version.lower() != data[6].lower():
510 continue
511
512
513 if len(data) < 9: data[7] = ""
514 poss_return += self._searchFunctionByHeuristic(data[1], data[2], data[3], data[4], heuristic, module, string.split(data[7],"|"))
515 return poss_return
516
518 """
519 Return a SHA-1 hash of the function, taking the raw bytes as data.
520
521 @type address: DWORD
522 @param address: address of the function to hash
523
524 @rtype: STRING
525 @return: SHA-1 hash of the function
526 """
527
528 f = self.imm.getFunction(address)
529 bbs = f.getBasicBlocks()
530 bucket = ""
531 data = {}
532
533 for bb in bbs:
534 data[bb.getStart()] = self.imm.readMemory(bb.getStart(), bb.getSize())
535
536 keys = data.keys()
537 keys.sort()
538
539 for key in keys:
540 bucket += data[key]
541
542 hash = hashlib.sha1(bucket).hexdigest()
543 del bucket
544 del bbs
545 del f
546 return hash
547
549 """
550 Return a list with the best BB to use for a search and the heuristic hash
551 of the function. This two components are the function hash.
552
553 @type address: DWORD
554 @param address: address of the function to hash
555
556 @type compressed: Boolean
557 @param compressed: return a compressed base64 representation or the raw data
558
559 @rtype: LIST
560 @return: 1st element is the generalized instructions to use with searchCommand
561 2nd element is the heuristic function hash (makeFunctionHashHeuristic)
562 3rd element is an exact hash of the function (makeFunctionHashExact)
563 4th element is a LIST of generalized instructions of the first BB (to find the function begin)
564 """
565
566 ret = []
567 ret.append(self.selectBasicBlock(address))
568 ret.append(self.makeFunctionHashHeuristic(address, compressed))
569 ret.append(self.makeFunctionHashExact(address))
570 ret.append(self.generalizeFunction(address)[1][1])
571 return ret
572
574 bbs = self.generalizeFunction(address)
575
576
577
578 hpoints = bb = 0
579 for id, instrs in bbs[1].iteritems():
580 map = {}
581 sum = 0
582 for instr in instrs:
583 sum += 1
584 base = instr.split(" ")
585 if "REP" in base[0]:
586 base = base[0] + " " + base[1]
587 else:
588 base = base[0]
589 map[base] = True
590 if sum > 7: break
591
592
593
594 points = sum + len(map)*4
595 if points > hpoints:
596
597
598 hpoints = points
599 bb = id
600 ret = ""
601 if bb:
602 ret = string.join(bbs[1][bb][0:8],"\\n")
603 del bbs
604 return ret
605
607 """
608 Take an address an return a generalized version of the function, dismissing
609 address and register dependant information.
610
611 @type address: DWORD
612 @param address: address to the function begin
613
614 @rtype: LIST
615 @return: the 1st value is a DICTIONARY of a Control Flow Graph of the
616 BB conexions (each BB have an arbitrary ID)
617 the 2nd value is a DICTIONARY using this arbitrary BB ID as the key
618 and a LIST of searchCommand suitable, generalized instructions.
619 """
620 bbcount = 1
621 bbmap = {}
622 cfg = {}
623 bbinfo = {}
624
625 f = self.imm.getFunction(address)
626 bbs = f.getBasicBlocks()
627
628
629 for bb in bbs:
630 if not bbmap.has_key(bb.getStart()):
631 bbmap[bb.getStart()] = bbcount
632 bbcount += 1
633 if not bbmap.has_key(bb.getEdges()[0]):
634 bbmap[bb.getEdges()[0]] = bbcount
635 bbcount += 1
636 if not bbmap.has_key(bb.getEdges()[1]):
637 bbmap[bb.getEdges()[1]] = bbcount
638 bbcount += 1
639
640 cfg[bbmap[bb.getStart()]] = [ bbmap[bb.getEdges()[0]], bbmap[bb.getEdges()[1]] ]
641
642 regex = []
643 for op in bb.getInstructions(self.imm):
644 asm = self.generalizeInstruction(op)
645 regex.append(asm)
646
647 bbinfo[bbmap[bb.getStart()]] = regex
648
649 del bbs
650 del f
651 del regex
652 return [ cfg, bbinfo ]
653
655 """
656 Generalize an instruction given an address or an opCode instance
657
658 @type inp: DWORD|OpCode OBJECT
659 @param inp: address to generalize or opcode to generalize
660
661 @rtype: STRING
662 @return: a generalized assembler instruction
663 """
664 if not isinstance(inp, opCode):
665 op = self.imm.disasm(inp)
666 else: op = inp
667
668 asm = op.getDisasm()
669
670
671 if op.isConditionalJmp():
672 asm = "JCC CONST"
673 if op.getImmConst() or op.operand[0][0] == DEC_CONST:
674
675 r = re.compile("(?<=[ ,\[])[a-z0-9_\.\@\-]*%X" % op.getImmConst(), re.I)
676 asm = r.sub('CONST', asm)
677 if op.getImmConst() > 0xFFFFBFFF:
678
679 r = re.compile("(?<=[ ,\[])[a-z0-9_\.\@\-]*\%X" % (op.getImmConst()-0x100000000), re.I)
680 asm = r.sub('CONST', asm)
681 if op.getAddrConst():
682 if not op.indexed:
683 asm = asm.split("[")[0]+"[CONST]"+asm.split("]")[1]
684 else:
685 tmp = "%+X" % struct.unpack("l", struct.pack("L", op.getAddrConst()))
686 asm = asm.replace(tmp,"+CONST")
687 if op.getJmpConst():
688 r = re.compile("(?<=[ ,\[])[a-z0-9_\.\-\@]*%X" % op.getJmpConst(), re.I)
689 asm = r.sub('CONST', asm)
690
691
692 asm = re.sub(r'(?i)<[a-z\.&_0-9\@\-]+>', "CONST", asm)
693
694
695 asm = re.sub(r'(?i)[a-z\.&_0-9\@\-]+\.[a-z\.&_0-9\@\-]+',"CONST", asm)
696
697
698 if not op.getAddrConst() or not op.indexed:
699 asm = re.sub(r'(?i)(?<![A-Z])E([ABCD]X|[SD]I)(?![A-Z])', 'R32', asm)
700 else:
701
702 asm = re.sub(r'(?i)(?<![A-Z\+\-\[])E([ABCD]X|[SD]I)(?![A-Z])', 'R32', asm)
703 asm = re.sub(r'(?i)(?<![A-Z])([ABCD]X|[SD]I)(?![A-Z])', 'R16', asm)
704 asm = re.sub(r'(?i)(?<![A-Z])[ABCD][HL](?![A-Z])', 'R8', asm)
705
706
707
708
709 return asm
710
712 """
713 Try to match a generalized BB with an address range (moving backward).
714
715 @type address: DWORD
716 @param address: address used to match with the generalized BB
717
718 @type firstbb: LIST
719 @param firstbb: a list of generalized assembler instructions
720
721 @type maxsteps: INTEGER
722 @param maxsteps: max amount of steps to go backward looking for a BB
723
724 @rtype: DWORD|None
725 @return: starting address of the BB that match with the generalized version or None if we don't find it
726 """
727
728 index = address
729 instr = 0
730 while instr < maxsteps:
731 num = 0
732 notmatch = False
733
734 for cmp in firstbb:
735 gen = self.generalizeInstruction(self.imm.disasmForward(index, num))
736 if gen != cmp:
737 notmatch = True
738
739 break
740 num += 1
741
742 if notmatch:
743 index = self.imm.disasmBackward(index, 1).getAddress()
744 instr += 1
745 else:
746
747 return index
748
749 return None
750
752 """
753 The main idea is traverse a function backward following Xrefs until we reach a point where there's no more Xrefs other than CALLs
754
755 @type address: DWORD
756 @param address: address used find the first BB
757
758 @rtype: DWORD|None
759 @return: Address of the first BB of the function or None if we don't find it
760 """
761
762 poss = []
763
764 xref = self.imm.getXrefFrom(address)
765 for info in xref:
766 if info[1] != 3:
767
768 poss.append(info[0])
769
770 if not xref and not recursive:
771 return None
772 if not poss:
773 return address
774
775 for addy in poss:
776 tmp = self.findFirstBB(addy, True)
777 if tmp:
778 return addy
779
780 return None
781