KevinHuSh commited on
Commit
389dac4
·
1 Parent(s): a505adc

Let task continue dispaching while meeting unexpected doc formats (#199)

Browse files

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

Issue link:#[[Link the issue
here](https://github.com/infiniflow/ragflow/issues/198)]

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Breaking Change (fix or feature that could cause existing
functionality not to work as expected)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Test cases
- [ ] Python SDK impacted, Need to update PyPI
- [ ] Other (please describe):

Files changed (1) hide show
  1. rag/svr/task_broker.py +40 -36
rag/svr/task_broker.py CHANGED
@@ -73,7 +73,7 @@ def dispatch():
73
  for t in tsks:
74
  TaskService.delete_by_id(t.id)
75
  except Exception as e:
76
- cron_logger.error("delete task exception:" + str(e))
77
 
78
  def new_task():
79
  nonlocal r
@@ -83,44 +83,48 @@ def dispatch():
83
  }
84
 
85
  tsks = []
86
- if r["type"] == FileType.PDF.value:
87
- do_layout = r["parser_config"].get("layout_recognize", True)
88
- pages = PdfParser.total_page_number(
89
- r["name"], MINIO.get(r["kb_id"], r["location"]))
90
- page_size = r["parser_config"].get("task_page_size", 12)
91
- if r["parser_id"] == "paper":
92
- page_size = r["parser_config"].get("task_page_size", 22)
93
- if r["parser_id"] == "one":
94
- page_size = 1000000000
95
- if not do_layout:
96
- page_size = 1000000000
97
- page_ranges = r["parser_config"].get("pages")
98
- if not page_ranges:
99
- page_ranges = [(1, 100000)]
100
- for s, e in page_ranges:
101
- s -= 1
102
- s = max(0, s)
103
- e = min(e - 1, pages)
104
- for p in range(s, e, page_size):
 
 
 
 
 
 
 
 
 
 
 
105
  task = new_task()
106
- task["from_page"] = p
107
- task["to_page"] = min(p + page_size, e)
108
  tsks.append(task)
 
 
 
 
 
 
 
109
 
110
- elif r["parser_id"] == "table":
111
- rn = HuExcelParser.row_number(
112
- r["name"], MINIO.get(
113
- r["kb_id"], r["location"]))
114
- for i in range(0, rn, 3000):
115
- task = new_task()
116
- task["from_page"] = i
117
- task["to_page"] = min(i + 3000, rn)
118
- tsks.append(task)
119
- else:
120
- tsks.append(new_task())
121
-
122
- bulk_insert_into_db(Task, tsks, True)
123
- set_dispatching(r["id"])
124
  tmf.write(str(r["update_time"]) + "\n")
125
  tmf.close()
126
 
 
73
  for t in tsks:
74
  TaskService.delete_by_id(t.id)
75
  except Exception as e:
76
+ cron_logger.exception(e)
77
 
78
  def new_task():
79
  nonlocal r
 
83
  }
84
 
85
  tsks = []
86
+ try:
87
+ if r["type"] == FileType.PDF.value:
88
+ do_layout = r["parser_config"].get("layout_recognize", True)
89
+ pages = PdfParser.total_page_number(
90
+ r["name"], MINIO.get(r["kb_id"], r["location"]))
91
+ page_size = r["parser_config"].get("task_page_size", 12)
92
+ if r["parser_id"] == "paper":
93
+ page_size = r["parser_config"].get("task_page_size", 22)
94
+ if r["parser_id"] == "one":
95
+ page_size = 1000000000
96
+ if not do_layout:
97
+ page_size = 1000000000
98
+ page_ranges = r["parser_config"].get("pages")
99
+ if not page_ranges:
100
+ page_ranges = [(1, 100000)]
101
+ for s, e in page_ranges:
102
+ s -= 1
103
+ s = max(0, s)
104
+ e = min(e - 1, pages)
105
+ for p in range(s, e, page_size):
106
+ task = new_task()
107
+ task["from_page"] = p
108
+ task["to_page"] = min(p + page_size, e)
109
+ tsks.append(task)
110
+
111
+ elif r["parser_id"] == "table":
112
+ rn = HuExcelParser.row_number(
113
+ r["name"], MINIO.get(
114
+ r["kb_id"], r["location"]))
115
+ for i in range(0, rn, 3000):
116
  task = new_task()
117
+ task["from_page"] = i
118
+ task["to_page"] = min(i + 3000, rn)
119
  tsks.append(task)
120
+ else:
121
+ tsks.append(new_task())
122
+
123
+ bulk_insert_into_db(Task, tsks, True)
124
+ set_dispatching(r["id"])
125
+ except Exception as e:
126
+ cron_logger.exception(e)
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  tmf.write(str(r["update_time"]) + "\n")
129
  tmf.close()
130