flowfusion/dataflow.py
Yuancheng Jiang 81577af68b init
2024-11-27 15:26:18 +08:00

127 lines
4.9 KiB
Python
Executable File

import re
"""
dataflow.py is a module for conducting lightweight source code analysis on PHP code.
It performs a very coarse-grained dataflow extraction, which is different from traditional
taint analysis. The focus is on tolerance to false positives (FP) and false negatives (FN)
in the context of fuzz testing.
"""
class PHPFastDataflow:
"""
A class that performs fast, coarse-grained dataflow analysis on PHP code.
It does not guarantee completeness but aims for soundness.
"""
def __init__(self):
"""
Initializes the PHPFastDataflow object with empty variables and dataflows.
"""
self.variables = [] # List to store extracted variables from PHP code
self.dataflows = [] # List of lists to store dataflows between variables
def clean(self):
"""
Resets the variables and dataflows to empty lists.
"""
self.variables = []
self.dataflows = []
def extract_variables(self):
"""
Extracts all PHP variables from the PHP code using a regular expression.
It ensures that each variable is unique by converting the list to a set.
"""
# Regular expression to match valid PHP variables
regex = r"\$[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*"
# Find all PHP variables in the provided code
self.variables = re.findall(regex, self.phpcode)
# Ensure unique variables by converting the list to a set
self.variables = list(set(self.variables))
def analyze_php_line(self, php_line):
"""
Analyzes a single line of PHP code to find variables and check
if multiple variables are interacting in the same line.
Returns:
- A tuple (True, [variables]) if multiple variables are found.
- A tuple (False, None) if no interaction between variables is detected.
"""
regex = r"\$[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*"
variables = list(set(re.findall(regex, php_line)))
if len(variables) > 1:
return (True, variables)
else:
return (False, None)
def merge_dataflows(self):
"""
Merges the dataflows by grouping variables that interact with each other.
Variables that appear together in dataflow analysis are merged into a single
list to represent their relationship.
"""
list_of_lists = self.variables
# Convert any single variables to lists to ensure consistent structure
for i in range(len(list_of_lists)):
if type(list_of_lists[i]) != list:
list_of_lists[i] = [list_of_lists[i]]
# Initialize an empty list to store merged sublists
merged_lists = []
# Iterate through each sublist (representing variables that interact)
for sublist in list_of_lists:
merged_with_existing = False
for merged_sublist in merged_lists:
# If any variable in the current sublist exists in a merged sublist,
# merge them by extending the merged list
if any(var in merged_sublist for var in sublist):
merged_sublist.extend(var for var in sublist if var not in merged_sublist)
merged_with_existing = True
break
# If no merge occurred, add the current sublist as a new group
if not merged_with_existing:
merged_lists.append(sublist)
# Update the variables with the merged dataflows
self.variables = merged_lists
def extract_dataflow(self):
"""
Extracts dataflows from the PHP code by analyzing each line of code.
It identifies variables and their interactions, grouping them into dataflows.
"""
for eachline in self.phpcode.split('\n'):
result, variables = self.analyze_php_line(eachline)
if result:
for each_var in variables:
# If the variable is already in the list, remove and replace it
if each_var in self.variables:
self.variables.remove(each_var)
# Add the new set of interacting variables
self.variables.append(variables)
# Merge dataflows to group interacting variables
self.merge_dataflows()
def analyze(self, phpcode):
"""
The main function to analyze a given PHP code for dataflows.
It extracts variables and their interactions to produce a list
of dataflows.
Args:
- phpcode: The PHP source code to analyze.
Returns:
- A list of merged dataflows, each representing groups of interacting variables.
"""
self.phpcode = phpcode
self.clean() # Reset variables and dataflows
self.extract_variables() # Extract variables from the code
self.extract_dataflow() # Extract dataflows between variables
return self.variables