Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions capa/features/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import abc
from typing import Optional


class Address(abc.ABC):
Expand Down Expand Up @@ -52,51 +53,59 @@ def __hash__(self):
class ProcessAddress(Address):
"""an address of a process in a dynamic execution trace"""

def __init__(self, pid: int, ppid: int = 0):
def __init__(self, pid: int, ppid: int = 0, id: Optional[int] = None):
assert ppid >= 0
assert pid > 0
self.ppid = ppid
self.pid = pid
self.id = id

def __repr__(self):
return "process(%s%s)" % (
s = "process(%s%s%s)" % (
f"ppid: {self.ppid}, " if self.ppid > 0 else "",
f"pid: {self.pid}",
f", id: {self.id}" if self.id is not None else "",
)
return s

def __hash__(self):
return hash((self.ppid, self.pid))
return hash((self.ppid, self.pid, self.id))

def __eq__(self, other):
assert isinstance(other, ProcessAddress)
return (self.ppid, self.pid) == (other.ppid, other.pid)
return (self.ppid, self.pid, self.id) == (other.ppid, other.pid, other.id)

def __lt__(self, other):
assert isinstance(other, ProcessAddress)
return (self.ppid, self.pid) < (other.ppid, other.pid)
self_id = self.id if self.id is not None else -1
other_id = other.id if other.id is not None else -1
return (self.ppid, self.pid, self_id) < (other.ppid, other.pid, other_id)


class ThreadAddress(Address):
"""addresses a thread in a dynamic execution trace"""

def __init__(self, process: ProcessAddress, tid: int):
def __init__(self, process: ProcessAddress, tid: int, id: Optional[int] = None):
assert tid >= 0
self.process = process
self.tid = tid
self.id = id

def __repr__(self):
return f"{self.process}, thread(tid: {self.tid})"
return f"{self.process}, thread(tid: {self.tid}{f', id: {self.id}' if self.id is not None else ''})"

def __hash__(self):
return hash((self.process, self.tid))
return hash((self.process, self.tid, self.id))

def __eq__(self, other):
assert isinstance(other, ThreadAddress)
return (self.process, self.tid) == (other.process, other.tid)
return (self.process, self.tid, self.id) == (other.process, other.tid, other.id)

def __lt__(self, other):
assert isinstance(other, ThreadAddress)
return (self.process, self.tid) < (other.process, other.tid)
self_id = self.id if self.id is not None else -1
other_id = other.id if other.id is not None else -1
return (self.process, self.tid, self_id) < (other.process, other.tid, other_id)


class DynamicCallAddress(Address):
Expand All @@ -114,7 +123,10 @@ def __hash__(self):
return hash((self.thread, self.id))

def __eq__(self, other):
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (other.thread, other.id)
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (
other.thread,
other.id,
)

def __lt__(self, other):
assert isinstance(other, DynamicCallAddress)
Expand Down
34 changes: 22 additions & 12 deletions capa/features/extractors/cape/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
import logging
from typing import Iterator

from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.cape.models import CapeReport
from capa.features.address import NO_ADDRESS, AbsoluteVirtualAddress, Address, ProcessAddress
from capa.features.common import Feature, String
from capa.features.extractors.base_extractor import ProcessHandle
from capa.features.extractors.cape.models import CapeReport
from capa.features.extractors.helpers import generate_symbols
from capa.features.file import Export, Import, Section

logger = logging.getLogger(__name__)

Expand All @@ -30,22 +30,32 @@ def get_processes(report: CapeReport) -> Iterator[ProcessHandle]:
"""
get all the created processes for a sample
"""
seen_processes = {}
counts: dict[tuple[int, int], int] = {}
for process in report.behavior.processes:
key = (process.parent_id, process.process_id)
counts[key] = counts.get(key, 0) + 1

seen_processes: dict[tuple[int, int], list] = {}
seq: dict[tuple[int, int], int] = {}
for process in report.behavior.processes:
addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id)
key = (process.parent_id, process.process_id)
seq[key] = seq.get(key, 0) + 1
process_id = seq[key] - 1 if counts[key] > 1 else None

addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id, id=process_id)
yield ProcessHandle(address=addr, inner=process)

# check for pid and ppid reuse
if addr not in seen_processes:
seen_processes[addr] = [process]
if key not in seen_processes:
seen_processes[key] = [process]
else:
logger.warning(
"pid and ppid reuse detected between process %s and process%s: %s",
process,
"es" if len(seen_processes[addr]) > 1 else "",
seen_processes[addr],
"es" if len(seen_processes[key]) > 1 else "",
seen_processes[key],
)
seen_processes[addr].append(process)
seen_processes[key].append(process)


def extract_import_names(report: CapeReport) -> Iterator[tuple[Feature, Address]]:
Expand Down
21 changes: 17 additions & 4 deletions capa/features/extractors/cape/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
import logging
from typing import Iterator

from capa.features.common import String, Feature
from capa.features.address import Address, ThreadAddress
from capa.features.common import Feature, String
from capa.features.extractors.base_extractor import ProcessHandle, ThreadHandle
from capa.features.extractors.cape.models import Process
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle

logger = logging.getLogger(__name__)

Expand All @@ -31,8 +31,21 @@ def get_threads(ph: ProcessHandle) -> Iterator[ThreadHandle]:
process: Process = ph.inner
threads: list[int] = process.threads

for thread in threads:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
counts: dict[int, int] = {}
for tid in threads:
counts[tid] = counts.get(tid, 0) + 1

seq: dict[int, int] = {}
warned_tids: set[int] = set()
for tid in threads:
if counts[tid] > 1 and tid not in warned_tids:
logger.warning("tid reuse detected for tid %d in process %s", tid, ph.address)
warned_tids.add(tid)

seq[tid] = seq.get(tid, 0) + 1
thread_id = seq[tid] - 1 if counts[tid] > 1 else None

address: ThreadAddress = ThreadAddress(process=ph.address, tid=tid, id=thread_id)
yield ThreadHandle(address=address, inner={})
Comment on lines +38 to 49

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This function handles TID reuse by assigning a lifecycle ID, which is great. However, unlike the corresponding `get_processes` function, which warns about PID/PPID reuse, there's no warning here for TID reuse. For consistency and better diagnostics, consider adding a warning when a TID is reused within a process.

    seq: dict[int, int] = {}
    warned_tids: set[int] = set()
    for tid in threads:
        if counts[tid] > 1 and tid not in warned_tids:
            logger.warning("TID reuse detected for tid %d in process %s", tid, ph.address)
            warned_tids.add(tid)

        seq[tid] = seq.get(tid, 0) + 1
        thread_id = seq[tid] - 1 if counts[tid] > 1 else None

        address: ThreadAddress = ThreadAddress(process=ph.address, tid=tid, id=thread_id)
        yield ThreadHandle(address=address, inner={})



Expand Down
32 changes: 20 additions & 12 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,31 @@
# limitations under the License.

import logging
from typing import Iterator
from pathlib import Path
from typing import Iterator

import capa.helpers
import capa.features.extractors.vmray.call
import capa.features.extractors.vmray.file
import capa.features.extractors.vmray.global_
from capa.features.common import Feature
import capa.helpers
from capa.features.address import (
NO_ADDRESS,
AbsoluteVirtualAddress,
Address,
ThreadAddress,
ProcessAddress,
DynamicCallAddress,
AbsoluteVirtualAddress,
ProcessAddress,
ThreadAddress,
)
from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall
from capa.features.common import Feature
from capa.features.extractors.base_extractor import (
CallHandle,
DynamicFeatureExtractor,
ProcessHandle,
SampleHashes,
ThreadHandle,
ProcessHandle,
DynamicFeatureExtractor,
)
from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorProcess, VMRayMonitorThread
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, FunctionCall, ParamList

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -99,7 +99,11 @@ def get_processes(self) -> Iterator[ProcessHandle]:
)
continue

address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
address: ProcessAddress = ProcessAddress(
pid=monitor_process.pid,
ppid=monitor_process.ppid,
id=monitor_process.monitor_id,
)
yield ProcessHandle(address, inner=monitor_process)

def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
Expand All @@ -114,7 +118,11 @@ def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]

address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
address: ThreadAddress = ThreadAddress(
process=ph.address,
tid=monitor_thread.tid,
id=monitor_thread.monitor_id,
)
yield ThreadHandle(address=address, inner=monitor_thread)

def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:
Expand Down
Loading
Loading