pszemraj committed on
Commit
2d980d5
·
1 Parent(s): 5dfe75e

✨ add and update utils

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. utils.py +119 -3
utils.py CHANGED
@@ -1,12 +1,12 @@
1
  """
2
  utils.py - Utility functions for the project.
3
  """
4
-
5
  import logging
 
6
  import re
7
  import subprocess
8
  from collections import defaultdict, deque
9
- from datetime import datetime
10
  from itertools import combinations, islice
11
  from pathlib import Path
12
  from typing import List
@@ -25,6 +25,86 @@ STOPWORDS = set(
25
  )
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def validate_pytorch2(torch_version: str = None):
29
  torch_version = torch.__version__ if torch_version is None else torch_version
30
 
@@ -46,7 +126,7 @@ def get_timestamp(detailed=False) -> str:
46
  )
47
 
48
 
49
- def truncate_word_count(text, max_words=512):
50
  """
51
  truncate_word_count - a helper function for the gradio module
52
  Parameters
@@ -141,6 +221,42 @@ def textlist2html(text_batches):
141
  return text_html_block
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def extract_keywords(
145
  text: str, num_keywords: int = 3, window_size: int = 5, kw_max_len: int = 20
146
  ) -> List[str]:
 
1
  """
2
  utils.py - Utility functions for the project.
3
  """
 
4
  import logging
5
+ import os
6
  import re
7
  import subprocess
8
  from collections import defaultdict, deque
9
+ from datetime import datetime, timedelta
10
  from itertools import combinations, islice
11
  from pathlib import Path
12
  from typing import List
 
25
  )
26
 
27
 
28
def remove_stagnant_files(
    freq: str = "hourly",
    search_path: str = ".",
    substring="DocSumm",
    remove_suffix=".txt",
):
    """
    remove_stagnant_files - Remove files that have not been modified in a certain amount of time.

    Only regular files located directly inside ``search_path`` (no recursion) are
    considered. A file is removed when its suffix equals ``remove_suffix``, its name
    contains ``substring``, and its modification time is older than the window
    selected by ``freq``. Files that disappear while the cleanup is running
    (e.g. removed by another process) are skipped instead of aborting the run.

    :param str freq: frequency of file removal, defaults to "hourly"
    :param str search_path: location to search for files, defaults to "."
    :param str substring: substring to search for in file names, defaults to "DocSumm"
    :param str remove_suffix: suffix of files to remove, defaults to ".txt"
    :raises ValueError: if freq is not one of "hourly", "daily", or "weekly"
    """
    max_age_by_freq = {
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
    }
    # Validate the frequency before touching the filesystem.
    try:
        max_age = max_age_by_freq[freq]
    except (KeyError, TypeError):
        raise ValueError(
            "Invalid frequency. Supported values are 'hourly', 'daily', and 'weekly'."
        ) from None

    # Files last modified before this POSIX timestamp are considered stagnant.
    cutoff = (datetime.now() - max_age).timestamp()
    search_path = Path(search_path)

    potential_files = [
        f for f in search_path.iterdir() if f.is_file() and f.suffix == remove_suffix
    ]
    logging.info(f"Found {len(potential_files)} files.")

    files_to_remove = []
    for candidate in potential_files:
        if substring not in candidate.name:
            continue
        try:
            # stat() once; the value is reused for both the check and the log.
            last_modified = candidate.stat().st_mtime
        except FileNotFoundError:
            # The file vanished after the directory listing; nothing to clean up.
            continue
        if last_modified < cutoff:
            files_to_remove.append(candidate)
            logging.debug(f"File {candidate} last modified at {last_modified}")

    logging.info(f"Removing {len(files_to_remove)} files.")
    removed = []
    for file_path in files_to_remove:
        try:
            file_path.unlink()
        except FileNotFoundError:
            # Already deleted by someone else between the scan and the removal.
            continue
        removed.append(file_path)
    logging.debug(f"Removed files: {removed}")
74
+
75
+
76
def compare_model_size(model_name: str, threshold: int = 500) -> bool:
    """
    compare_model_size - compare the model size encoded in a model name to a threshold

    The name is scanned for integers immediately followed by a size unit:
    "k" (thousand), "M" (million), "G" or "b" (billion). The last such token in
    the name is used, so a trailing number without a unit (a version suffix,
    a context length, ...) does not hide an earlier size token.

    :param str model_name: the model name to compare
    :param int threshold: the threshold to compare against in millions, defaults to 500
    :return: True if the model size is greater than the threshold, False if it is not,
        None if the name contains no number with a recognized unit
    """
    # The unit is mandatory: numbers without a unit carry no size information.
    pattern = r"(\d+)(M|G|k|b)"  # param regex

    matches = re.findall(pattern, model_name)
    if not matches:
        return None

    # Extract the parameter count and unit from the last sized token
    parameter_count, unit = matches[-1]
    parameter_count = int(parameter_count)

    # Normalize to millions of parameters
    if unit in ("G", "b"):
        parameter_count *= 1000
    elif unit == "k":
        parameter_count /= 1000
    # unit == "M" is already expressed in millions

    return parameter_count > threshold
106
+
107
+
108
  def validate_pytorch2(torch_version: str = None):
109
  torch_version = torch.__version__ if torch_version is None else torch_version
110
 
 
126
  )
127
 
128
 
129
+ def truncate_word_count(text, max_words=1024):
130
  """
131
  truncate_word_count - a helper function for the gradio module
132
  Parameters
 
221
  return text_html_block
222
 
223
 
224
def extract_batches(html_string, pattern=None, flags=None) -> list:
    """
    Pull titled text batches out of an HTML string.

    Args:
        html_string (str): The HTML string to search.
        pattern (str, optional): Regular expression with two capture groups
            (title, content). Defaults to an ``<h2>`` heading followed by a
            ``<p style="white-space: pre-line;">`` paragraph.
        flags (int, optional): Flags passed to the regular expression. Defaults to re.DOTALL.

    Returns:
        list: One dictionary per match with whitespace-stripped 'title' and
            'content' values; an empty list if the pattern is not a valid
            regular expression.
    """
    effective_pattern = (
        r'<h2 style="font-size: 22px; color: #555;">(.*?)</h2>\s*<p style="white-space: pre-line;">(.*?)</p>'
        if pattern is None
        else pattern
    )
    effective_flags = re.DOTALL if flags is None else flags

    try:
        found = re.findall(effective_pattern, html_string, effective_flags)
    except re.error as e:
        logging.error(f"An error occurred while trying to extract batches: {e}")
        return []

    # Turn each (title, content) pair into a dictionary
    batches = []
    for title, content in found:
        batches.append({"title": title.strip(), "content": content.strip()})
    return batches
258
+
259
+
260
  def extract_keywords(
261
  text: str, num_keywords: int = 3, window_size: int = 5, kw_max_len: int = 20
262
  ) -> List[str]: