import re
from bs4 import BeautifulSoup, Tag
from smolagents import tool
import requests
import markdownify

@tool
def get_wikipedia_article_sections(url: str) -> str:
| """ | |
| Visit a specific wikipedia article, and return a structured list of sections (table of contents) | |
| describing what information can be found on this page. | |
| Args: | |
| url (str): The url of the wikipedia article to visit. | |
| Returns: | |
| str: Table of contents, or error message. | |
| """ | |
| if "wikipedia." not in url: | |
| return f"the provided url does not appear to be a valid Wikipedia page." | |
| try: | |
| resp = requests.get(url) | |
| except Exception as e: | |
| return f"got an error: {str(e)}" | |
| if resp.status_code >= 400: | |
| return f"got an error (http status {resp.status_code}): {resp.text})" | |
| soup = BeautifulSoup(resp.text, "html.parser") | |
| def recurse_toc(ul, level=0): | |
| lines = [] | |
| for li in ul.find_all('li', recursive=False): | |
| a = li.find("a", class_="vector-toc-link") | |
| if a: | |
| href = a.get("href", "") | |
| if href.startswith("#") and href != "#": | |
| indent = " " * level | |
| lines.append(f"{indent}- {href[1:]}") | |
| # Recurse into child ULs | |
| child_ul = li.find("ul", recursive=False) | |
| if child_ul: | |
| lines.extend(recurse_toc(child_ul, level + 1)) | |
| return lines | |
| toc_ul = soup.find(id="mw-panel-toc-list") | |
| if toc_ul is None: | |
| return "could not extract table of contents; ensure that this is a valid link to a wikipedia article." | |
| links = recurse_toc(toc_ul) | |
| if not links: | |
| return "table of contents is empty or could not be parsed." | |
| toc_lines = ["Sections within article:"] | |
| for link in links: | |
| toc_lines.append(link) | |
| return "\n".join(toc_lines) | |
@tool
def extract_wikipedia_section(url: str, section_id: str) -> str:
| """ | |
| Visit a specific wikipedia article, and return the specified section. You can get a list of available sections using get_wikipedia_article_sections | |
| Args: | |
| url (str): The url of the wikipedia article to visit. | |
| section_id (str): The id of the section to retrieve. | |
| Returns: | |
| str: the contents of the section. | |
| """ | |
| if "wikipedia." not in url: | |
| return f"the provided url does not appear to be a valid Wikipedia page." | |
| try: | |
| resp = requests.get(url) | |
| except Exception as e: | |
| return f"got an error: {str(e)}" | |
| if resp.status_code >= 400: | |
| return f"got an error (http status {resp.status_code}): {resp.text})" | |
| def get_classes(tag): | |
| if "class" not in tag.attrs: | |
| return [] | |
| return [str(c) for c in tag.get("class")] | |
| def get_heading_hierarchy(tag) -> int: | |
| classes = get_classes(tag) | |
| for c in classes: | |
| m = re.search("mw-heading(\d+)", c) | |
| if not m: | |
| continue | |
| return int(m.group(1)) | |
| resp = requests.get(url) | |
| resp.raise_for_status() | |
| soup = BeautifulSoup(resp.text, "html.parser") | |
| section_id = section_id.strip("-").strip() | |
| # Find the element (usually an <h2>, <h3>, etc.) with this id in its child <span> | |
| heading = soup.find(lambda tag: tag.name in ['h1','h2','h3','h4','h5','h6'] and tag.get("id") == section_id) | |
| if not heading: | |
| return "the specified section could not be found on this page. try using get_wikipedia_article_sections to get a list of available sections." | |
| parent = heading.parent | |
| start_hierarchy = get_heading_hierarchy(parent) | |
| if start_hierarchy is None: | |
| return "failed to get section - this is likely a bug with the tool - the wikipedia page appears to have unexpected format" | |
| section_content = [] | |
| # Gather all siblings after the heading, stopping at the next heading of equal/higher rank | |
| for sibling in parent.next_siblings: | |
| if not isinstance(sibling, Tag): | |
| # this element is not an html tag, - probably just some text content | |
| section_content.append(str(sibling)) | |
| continue | |
| hierarchy = get_heading_hierarchy(sibling) | |
| if hierarchy is None: | |
| # this element is not a section header - add it. | |
| section_content.append(str(sibling)) | |
| continue | |
| if hierarchy > start_hierarchy: | |
| # this is lower in the hierarchy than the requested section, add it | |
| section_content.append(str(sibling)) | |
| continue | |
| break | |
| content_html = '\n'.join(section_content).strip() | |
| res = markdownify.markdownify(content_html) | |
| res = re.sub(r"\n{3,}", "\n\n", res) | |
| return res | |
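
# A minimal local smoke test (a sketch, not part of the tool definitions): the article URL and the
# "History" section id below are illustrative assumptions - substitute any Wikipedia article URL and
# one of the section ids returned by get_wikipedia_article_sections.
if __name__ == "__main__":
    article_url = "https://en.wikipedia.org/wiki/Web_scraping"
    # First list the sections that exist on the page...
    print(get_wikipedia_article_sections(article_url))
    # ...then pull one of them, converted to markdown.
    print(extract_wikipedia_section(article_url, section_id="History"))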