doi resolver
Helper Class providing multiple static functions to extract a doi from a url
check_doi_list_valid(potential_dois)
check if a list of potential dois are valid and if so return the valid doi
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
potential_dois |
a list of dois that should be checked |
required |
Source code in src/doi_resolver.py
def check_doi_list_valid(potential_dois):
    """check if a list of potential dois are valid and if so return the valid doi

    Arguments:
        potential_dois: a list of dois that should be checked

    Returns:
        the last valid doi found, or False if no candidate resolves
    """
    # drop None and empty entries, then build one comma-separated batch query
    candidates = [doi for doi in potential_dois if doi]
    valid_doi = False
    if candidates:
        # doi.org/doiRA resolves the registration agency for each doi;
        # only entries carrying an 'RA' key correspond to registered dois
        r = requests.get("http://doi.org/doiRA/" + ','.join(candidates))
        json_response = r.json()
        for entry in json_response:
            if 'RA' in entry:
                valid_doi = entry['DOI']
    return valid_doi
crossref_url_search(url)
search the url in crossref eventdata to get a doi
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
url |
the url to get a doi for |
required |
Source code in src/doi_resolver.py
def crossref_url_search(url):
    """search the url in crossref eventdata to get a doi

    Arguments:
        url: the url to get a doi for

    Returns:
        the doi of the first matching event, or False if none was found
    """
    # let requests url-encode the query parameters; the original raw string
    # concatenation produced malformed queries for urls containing '&' or '#'
    r = requests.get("http://api.eventdata.crossref.org/v1/events",
                     params={'rows': 1, 'obj.url': url})
    if r.status_code != 200:
        return False
    json_response = r.json()
    if json_response.get('status') != 'ok':
        return False
    # logging.debug(json_response)
    for event in json_response.get('message', {}).get('events', []):
        return event['obj_id'][16:]  # https://doi.org/ -> 16
    return False
get_dois_regex(regex, temp_doi)
find all occurrences of a given regex in a doi-string whose ending is searched
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
regex |
the regex to look for |
required | |
temp_doi |
the doi to use |
required |
Source code in src/doi_resolver.py
def get_dois_regex(regex, temp_doi):
    """find all occurrences of a given regex in a doi-string whose ending is searched

    Arguments:
        regex: the compiled regex to look for (must contain one capture group)
        temp_doi: the doi to use

    Returns:
        the captured group of the first match, or None if the regex does not match
    """
    # a single search() replaces the original search()+findall() pair,
    # scanning the string only once
    match = regex.search(temp_doi)
    if match is not None:
        return match.group(1)
    return None
get_filtered_dois_from_meta(potential_dois)
check potential dois from meta tags for dois to extract them in case they have a full url
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
potential_dois |
the dois to filter |
required |
Source code in src/doi_resolver.py
def get_filtered_dois_from_meta(potential_dois):
    """check potential dois from meta tags for dois to extract them in case they have a full url

    Arguments:
        potential_dois: the dois to filter

    Returns:
        the set of candidates that contain a doi-shaped substring
    """
    # strip the 'doi:' scheme prefix from every candidate first
    stripped = {candidate.replace('doi:', '') for candidate in potential_dois}
    doi_pattern = re.compile("(10\\.\\d{4,9}(?:/|%2F|%2f)[^\\s]+)")
    # keep only candidates that actually look like a doi
    return {candidate for candidate in stripped
            if doi_pattern.search(candidate) is not None}
get_lxml(page)
use lxml to search for meta tags that could contain the doi
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
page |
the page to search |
required |
Source code in src/doi_resolver.py
def get_lxml(page):
    """use lxml to search for meta tags that could contain the doi

    Arguments:
        page: the response whose html content is searched

    Returns:
        a set with the 'content' value of every doi-carrying meta tag
    """
    # attribute values are lowercased before comparison, so the candidate
    # names must be lowercase as well -- the original mixed-case entries
    # ('DC.DOI', 'DC.Identifier.DOI', 'DOIs') could never match
    doi_attribute_names = {'citation_doi', 'dc.identifier', 'evt-doipage', 'news_doi',
                           'doi', 'dc.doi', 'dc.identifier.doi', 'dois',
                           'bepress_citation_doi', 'rft_id'}
    content = html.fromstring(page.content)
    result = set()
    for meta in content.xpath('//meta'):
        for _, value in sorted(meta.items()):
            if value.strip().lower() in doi_attribute_names:
                result.add(meta.get('content'))
    return result
get_potential_dois_from_text(text)
use multiple different regexes to get a list of potential dois, it uses a very generic regex first which only checks for the bare minimum start to the end of line, this result will then be searched for possible endings generating possible dois the result is a set and will likely contain invalid dois
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text |
the text which should be checked |
required |
Source code in src/doi_resolver.py
def get_potential_dois_from_text(text):
    """use multiple different regexes to get a list of potential dois,
    it uses a very generic regex first which only checks for the bare minimum start to the end of line,
    this result will then be searched for possible endings generating possible dois
    the result is a set and will likely contain invalid dois

    Arguments:
        text: the text which should be checked

    Returns:
        a set of candidate doi strings (possibly empty, never containing None)
    """
    # generic pattern: doi prefix up to the next whitespace
    doi_re = re.compile("(10\\.\\d{4,9}(?:/|%2F|%2f)[^\\s]+)")
    # each pattern cuts the raw candidate at a different plausible ending
    ending_patterns = (
        re.compile("^(10\\.\\d+/.*)/.*$"),     # cut at the last slash
        re.compile("^(10\\.\\d+/.*?)/.*$"),    # cut at the first slash
        re.compile("^(10\\.\\d+/.*);.*$"),     # cut at a semicolon
        re.compile("^(10\\.\\d+/.*?)#.*$"),    # cut at a fragment
        re.compile("^(10\\.\\d+/.*?)\\?.*$"),  # cut at a query string
        re.compile("^(10\\.\\d+/.*?)&.*$"),    # cut at a query separator
        re.compile("^(10\\.\\d+/.*)v1*$"),     # biorxiv, make if v\\d+ (v and digit v1,v2,v3
    )
    result = set()
    match = doi_re.search(text)  # search once instead of search-then-group
    if match is not None:
        temp_doi = match.group()
        result.add(temp_doi)
        for pattern in ending_patterns:
            result.add(get_dois_regex(pattern, temp_doi))
        # get_dois_regex yields None for non-matching patterns; callers only
        # want doi strings, so drop the None entry
        result.discard(None)
    return result
get_response(url, s, r=0)
get a response from a given url using a given session s, a session can be used for headers, this function is cached up to 100 elements
!!! arguments
url: the url to get
s: the session to use
Source code in src/doi_resolver.py
@lru_cache(maxsize=100)
def get_response(url, s, r=0):
    """get a response from a given url using a given session s, a session can be used for headers,
    this function is cached up to 100 elements

    Arguments:
        url: the url to get
        s: the session to use
        r: current retry count; retries use exponential backoff, at most 3 attempts

    Returns:
        the response object, or None if the request keeps failing
    """
    try:
        url = url.replace('arxiv.org', 'export.arxiv.org')  # arxiv wants this url to be used by machines
        result = s.get(url, stream=False, timeout=5)
    except (ConnectionRefusedError, SSLError, ReadTimeoutError, requests.exceptions.TooManyRedirects,
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout, NewConnectionError, requests.exceptions.SSLError, ConnectionError):
        logging.warning('Percolator error, reset session')
        s = Session()
        # use a browser user agent so sites do not block the retried request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.104 Safari/537.36'
        }
        s.headers.update(headers)
        if r < 3:
            logging.warning('retry in ' + str(pow(2, r)) + 's')
            time.sleep(pow(2, r))
            # BUG FIX: the recursive retry result was previously discarded
            # (no 'return'), so every retried request returned None even
            # when the retry succeeded
            return get_response(url, s, r + 1)
        return None
    else:
        return result
link_url(url)
link a url to a valid doi, it will try to get potential dois using multiple regexes, then check if they are valid and return the doi; it uses multiple methods to search for the doi, this function is cached up to 10000 elements
!!! arguments
url: the url to get a doi for
Source code in src/doi_resolver.py
@lru_cache(maxsize=10000)
def link_url(url):
    """link a url to a valid doi,
    it will try to get potential dois using multiple regexes, then check if they
    are valid and return the doi,
    it uses multiple methods to search for the doi,
    this function is cached up to 10000 elements

    Arguments:
        url: the url to get a doi for

    Returns:
        the doi as a string, or False if no doi could be linked
    """
    # logging.warning(url)
    # 1. check if the url itself contains the doi
    doi = check_doi_list_valid(get_potential_dois_from_text(url))
    if doi:
        logging.debug('url')
        return doi
    s = Session()
    # use a browser user agent so sites do not block the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.104 Safari/537.36'
    }
    s.headers.update(headers)
    r = get_response(url, s)
    # 2. check if the doi is in any meta tag of the fetched page
    if r:
        pot_doi = get_lxml(r)
        doi = check_doi_list_valid(get_filtered_dois_from_meta(pot_doi))
        # check_doi_list_valid returns a doi string or False, so the original
        # extra comparison against an empty set was redundant
        if doi:
            logging.debug('meta')
            return doi
    # 3. check if crossref knows this url and returns the doi
    doi = crossref_url_search(url)
    if doi:
        logging.debug('crossref')
        return doi
    # 4. do a fulltext search of the response body
    if r:
        doi = check_doi_list_valid(search_fulltext(r))
        if doi:
            logging.debug('fulltext')
            return doi
    return False
search_fulltext(r)
search the fulltext of a response
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
r |
the response we want to search |
required |
Source code in src/doi_resolver.py
def search_fulltext(r):
    """search the fulltext of a response for potential dois

    Arguments:
        r: the response we want to search

    Returns:
        a set of potential dois found in the response body
    """
    body = r.text
    return get_potential_dois_from_text(body)
url_doi_check(data)
check data for urls, get first url in urls, prefer expanded_url
!!! arguments data: data to get url from
Source code in src/doi_resolver.py
def url_doi_check(data):
    """check data for urls,
    get first url in urls,
    prefer expanded_url

    Arguments:
        data: data to get url from (presumably a tweet-like dict with
              entities/urls entries -- verify against caller)

    Returns:
        the linked doi, or False if no url could be resolved to a doi
    """
    # guard clause: nothing to do without an entities/urls structure
    if 'entities' not in data or 'urls' not in data['entities']:
        return False
    for url in data['entities']['urls']:
        doi_data = False
        if 'expanded_url' in url:
            doi_data = link_url(url['expanded_url'])
        # fall back to the unwound url only when the expanded one yielded nothing
        if doi_data is False and 'unwound_url' in url:
            doi_data = link_url(url['unwound_url'])
        if doi_data is not False:
            logging.debug(doi_data)
            return doi_data
    # the original post-loop 'is not False' branch was unreachable: every
    # successful lookup already returned inside the loop
    return False