Daniel Nichols Mazin Karjikar committed on
Commit
8c0b7ca
1 Parent(s): 6f00050

Add function parsing (#3)

Browse files

* added source code function parsing functionality - still needs to be tested

* added parameter to slowest function parser, can now test multiple prompt formatter functionality

* cleaned up function parsing formatter

* now importing function grabber function instead of starting subprocess

---------

Co-authored-by: Mazin Karjikar <mazin@pssg-mordor.umiacs.umd.edu>

Files changed (3) hide show
  1. requirements.txt +2 -1
  2. src/function_grabber.py +100 -0
  3. src/rag.py +63 -8
requirements.txt CHANGED
@@ -3,4 +3,5 @@ hatchet==1.4.0
3
  google-generativeai==0.7.2
4
  openai==1.37.0
5
  tiktoken==0.7.0
6
- llama-cpp-python==0.2.90
 
 
3
  google-generativeai==0.7.2
4
  openai==1.37.0
5
  tiktoken==0.7.0
6
+ llama-cpp-python==0.2.90
7
+ clang==17.0.6
src/function_grabber.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ This script extracts the function that contains a specified line number in a C++ file.
2
+ """
3
+ from argparse import ArgumentParser
4
+ import os
5
+ from typing import List, Tuple
6
+ import clang.cindex
7
+
8
+
9
def get_functions_at_lines(fpath: os.PathLike, lines: List[int], clang_path: os.PathLike = None) -> List[Tuple[str, Tuple[int, int]]]:
    """ Find all the functions that contain the specified lines in a file.

    Every requested line is resolved with get_function_at_line. A line that
    falls inside the (start, end) range of a function that was already
    collected is skipped, so each function appears at most once in the result.

    Args:
        fpath: source file to search.
        lines: line numbers to look up.
        clang_path: optional libclang library path, forwarded unchanged to
            get_function_at_line.

    Returns:
        A list of (function_source, (start_line, end_line)) tuples in the
        order the functions were first hit. Lines for which no function was
        found contribute nothing.
    """
    functions = []
    for line in lines:

        # check if we've already found this line
        if any(start <= line <= end for _, (start, end) in functions):
            continue

        # named line_range (not `range`) so the builtin is not shadowed
        function_body, line_range = get_function_at_line(fpath, line, clang_path=clang_path)
        if function_body:
            functions.append((function_body, line_range))
    return functions
23
+
24
+
25
def remove_macros(filename: str, line_numbers: List[int]) -> List[int]:
    """ Remove all preprocessor directives from a file, rewriting it in place.

    A line is dropped when its first non-blank character is '#', and so is
    every following line that continues that directive through a trailing
    backslash (multi-line macros).

    Args:
        filename: path of the file to rewrite. The file is overwritten.
        line_numbers: line indices to translate to their position after the
            removal. They are compared against the 0-based index of each line.
            NOTE(review): the rest of this module works with clang's 1-based
            line numbers -- confirm which convention callers expect.

    Returns:
        The translated indices, in file order (not in the order given), one
        entry per distinct requested index that exists in the file. A
        requested line that is itself removed yields the index of the last
        kept line before it, which is -1 when no line was kept before it.
    """
    with open(filename, 'r') as f:
        lines = f.readlines()

    # set membership keeps the scan linear in the size of the file
    wanted = set(line_numbers)

    new_line_numbers = []
    new_lines = []
    num_removed = 0
    in_directive = False  # True while inside a backslash-continued directive
    for i, line in enumerate(lines):
        if in_directive or line.lstrip().startswith('#'):
            num_removed += 1
            # the directive goes on if this physical line ends with a backslash
            in_directive = line.rstrip('\r\n').endswith('\\')
        else:
            new_lines.append(line)

        if i in wanted:
            new_line_numbers.append(i - num_removed)

    with open(filename, 'w') as f:
        f.write(''.join(new_lines))

    return new_line_numbers
48
+
49
+
50
def get_function_at_line(filename, line_number, clang_path=None):
    """ Return the source of the function in `filename` that contains a line.

    The file is parsed with libclang and its AST is walked depth-first; the
    first function-like cursor that is located in `filename` itself (not in
    an included header) and whose extent covers `line_number` wins.

    Args:
        filename: path of the C/C++ source file to parse.
        line_number: line to look up, 1-based as reported by clang. Anything
            accepted by int() is allowed (e.g. a numeric string).
        clang_path: optional path to the libclang shared library. It is only
            applied while clang.cindex.Config has not been loaded yet.

    Returns:
        (source_text, (start_line, end_line)) for the matching function, or
        (None, None) when the line number is not an integer, the file cannot
        be parsed, or no function covers the line.
    """
    # Callers may hand over a string; comparing str with int line numbers
    # would raise TypeError further down.
    try:
        line_number = int(line_number)
    except (TypeError, ValueError):
        return None, None

    if clang_path and not clang.cindex.Config.loaded:
        clang.cindex.Config.set_library_file(clang_path)
    index = clang.cindex.Index.create()

    try:
        translation_unit = index.parse(filename)
    except clang.cindex.TranslationUnitLoadError:
        return None, None

    function_kinds = (
        clang.cindex.CursorKind.FUNCTION_DECL,
        clang.cindex.CursorKind.CXX_METHOD,
        clang.cindex.CursorKind.CONSTRUCTOR,
        clang.cindex.CursorKind.DESTRUCTOR,
        clang.cindex.CursorKind.FUNCTION_TEMPLATE,
    )

    target_name = os.fspath(filename)
    target_path = os.path.realpath(target_name)

    def in_target_file(node):
        # The translation unit also contains everything pulled in through
        # #include; those cursors carry line numbers of *their* file and must
        # not be matched against a line number of `filename`.
        node_file = node.location.file
        if node_file is None:
            return False
        return node_file.name == target_name or os.path.realpath(node_file.name) == target_path

    def find_function(node):
        # Check if node is function-like and contains the line number
        if node.kind in function_kinds and in_target_file(node):
            if node.extent.start.line <= line_number <= node.extent.end.line:
                return node

        for child in node.get_children():
            result = find_function(child)
            if result is not None:
                return result
        return None

    # Start from the root node (translation unit) and find the function
    function_node = find_function(translation_unit.cursor)
    if function_node is None:
        return None, None

    start_line = function_node.extent.start.line
    end_line = function_node.extent.end.line
    with open(filename, 'r') as f:
        lines = f.readlines()
    # extents are 1-based and inclusive
    return ''.join(lines[start_line - 1:end_line]), (start_line, end_line)
88
+
89
+
90
if __name__ == "__main__":
    # Command-line entry point: print the function that contains a given line.
    parser = ArgumentParser(description="Extract the function that contains a specified line number in a C++ file.")
    parser.add_argument("filename", help="The C++ file to analyze")
    parser.add_argument("line_number", type=int, help="The line number to search for")
    parser.add_argument("--clang_path", help="Path to libclang.so if necessary")
    args = parser.parse_args()

    result, rnge = get_function_at_line(args.filename, args.line_number, clang_path=args.clang_path)
    if result is None:
        # no range exists in this case, so print the message on its own
        print(f"No function found at line {args.line_number}")
    else:
        print(result, rnge)
src/rag.py CHANGED
@@ -8,6 +8,7 @@ from typing import Optional, List, Mapping
8
  from os import PathLike
9
  from os.path import basename
10
  import random
 
11
 
12
  from profiles import Profile
13
 
@@ -63,8 +64,9 @@ class BasicPromptFormatter(PerfGuruPromptFormatter):
63
 
64
  class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
65
 
66
- def __init__(self):
67
  super().__init__("slowest_function")
 
68
 
69
  def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
70
  if not code_paths:
@@ -83,11 +85,10 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
83
  if error_fn:
84
  error_fn("Profile type must be provided if a profile file is provided.")
85
  return None
86
- k = 1
87
  profile = self._read_profile(profile_path, profile_type)
88
- slowest = profile.gf.dataframe.nlargest(k, 'time')
89
- function_names = [slowest['name'].values[i] for i in range(k) if i < len(slowest['name'].values)]
90
- execution_times = [slowest['time'].values[i] for i in range(k) if i < len(slowest['name'].values)]
91
  # print(profile_content)
92
  hot_path = profile.gf.hot_path()
93
  hot_path_functions = []
@@ -95,7 +96,7 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
95
  for node in hot_path:
96
  if "name" in node.frame.attrs:
97
  hot_path_functions.append(node.frame["name"])
98
- hot_path_functions = hot_path_functions[:k]
99
 
100
  profile_content = (f"The slowest functions are {function_names} and they took {execution_times} seconds, respectively." +
101
  f" Also, these functions were in the hot path: {hot_path_functions}.")
@@ -106,9 +107,63 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
106
 
107
  return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
108
 
109
- AVAILABLE_FORMATTERS = [SlowestFunctionPromptFormatter()]
110
- # AVAILABLE_FORMATTERS.append(BasicPromptFormatter())
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def select_random_formatter() -> PerfGuruPromptFormatter:
114
  return random.choice(AVAILABLE_FORMATTERS)
 
8
  from os import PathLike
9
  from os.path import basename
10
  import random
11
+ from function_grabber import get_function_at_line
12
 
13
  from profiles import Profile
14
 
 
64
 
65
  class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
66
 
67
+ def __init__(self, k):
68
  super().__init__("slowest_function")
69
+ self.k = k
70
 
71
  def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
72
  if not code_paths:
 
85
  if error_fn:
86
  error_fn("Profile type must be provided if a profile file is provided.")
87
  return None
 
88
  profile = self._read_profile(profile_path, profile_type)
89
+ slowest = profile.gf.dataframe.nlargest(self.k, 'time')
90
+ function_names = [slowest['name'].values[i] for i in range(self.k) if i < len(slowest['name'].values)]
91
+ execution_times = [slowest['time'].values[i] for i in range(self.k) if i < len(slowest['name'].values)]
92
  # print(profile_content)
93
  hot_path = profile.gf.hot_path()
94
  hot_path_functions = []
 
96
  for node in hot_path:
97
  if "name" in node.frame.attrs:
98
  hot_path_functions.append(node.frame["name"])
99
+ hot_path_functions = hot_path_functions[:self.k]
100
 
101
  profile_content = (f"The slowest functions are {function_names} and they took {execution_times} seconds, respectively." +
102
  f" Also, these functions were in the hot path: {hot_path_functions}.")
 
107
 
108
  return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
109
 
110
class SlowestFunctionParsedPromptFormatter(PerfGuruPromptFormatter):
    """ Prompt formatter that sends only the source of the slowest function.

    When a profile is given, the row with the largest 'time' is taken from the
    profile dataframe and its 'line' value is used to extract the enclosing
    function from the code files. If no function can be extracted, the full
    contents of all code files are used instead.
    """

    def __init__(self):
        super().__init__("slowest_function_parsed")

    def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
        """ Build the prompt text from the code, the profile summary and the question.

        Args:
            prompt: the user's question, appended at the end of the result.
            code_paths: code files to include; at least one is required.
            profile_path: optional profile file.
            profile_type: type of the profile; required when profile_path is given.
            error_fn: optional callable that receives a message when the
                arguments are invalid.

        Returns:
            The formatted prompt, or None when the arguments are invalid.
        """
        if not code_paths:
            if error_fn:
                error_fn("No code files provided. At least one code file must be provided.")
            return None

        if profile_path and not profile_type:
            if error_fn:
                error_fn("Profile type must be provided if a profile file is provided.")
            return None

        function_name = None
        line_number = None
        if profile_path:
            k = 1
            profile = self._read_profile(profile_path, profile_type)
            slowest = profile.gf.dataframe.nlargest(k, 'time')
            function_name = slowest['name'].values[0] if len(slowest['name'].values) > 0 else None

            # Not every profile carries line information, and a missing value
            # may show up as NaN; treat both as "no line number".
            if 'line' in slowest.columns and len(slowest['line'].values) > 0:
                try:
                    line_number = int(slowest['line'].values[0])
                except (TypeError, ValueError):
                    line_number = None

        # read the code files once; both branches below work on this mapping
        code_file_contents = self._read_code_files(code_paths)

        concatenated_code = ""
        profile_content = ""

        if profile_path:
            if line_number:
                for code_path in code_file_contents:
                    # parse the real path; the basename is only used as label
                    code, _ = get_function_at_line(code_path, line_number)
                    if code:
                        concatenated_code = f"{basename(code_path)}:\n{code}\n\n"
                        print("Only function code:", concatenated_code)
                        break

            profile_content = (f"The slowest function is {function_name}.")
            print(profile_content)

        # fall back to the complete files when no single function was extracted
        if concatenated_code == "":
            for code_path, content in code_file_contents.items():
                fname = basename(code_path)
                concatenated_code += f"{fname}:\n{content}\n\n"

        return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
160
+
161
# Prompt formatters that select_random_formatter may pick from.
AVAILABLE_FORMATTERS = [
    SlowestFunctionPromptFormatter(k=1),
    SlowestFunctionPromptFormatter(k=5),
    SlowestFunctionPromptFormatter(k=10),
    # BasicPromptFormatter() is currently not registered
    SlowestFunctionParsedPromptFormatter(),
]
167
 
168
def select_random_formatter() -> PerfGuruPromptFormatter:
    """ Pick one entry of AVAILABLE_FORMATTERS at random and return it.
    """
    chosen = random.choice(AVAILABLE_FORMATTERS)
    return chosen