# # This script can be used for any purpose without limitation subject to the # conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx # # This permission notice and the following statement of attribution must be # included in all copies or substantial portions of this script. # # 2015-06-17: created by the Cambridge Crystallographic Data Centre # '''Get PubChem info for a CSD Entry This script will extract and print the PubChem data for a particular CSD entry if it is available. It makes use of: - CCDC to obtain molecule and display structure - RDKit to generate InChI Key - PubChem REST API to get data ''' from __future__ import division, absolute_import, print_function import sys import json import tempfile import requests import ccdc.io from ccdc.diagram import DiagramGenerator sys.path.append('C:\RDKit_2014_09_1') import rdkit.Chem csd_reader = ccdc.io.EntryReader('CSD') if len(sys.argv) != 3: print('Usage: ') sys.exit(1) refcode = sys.argv[1].upper() outfn = sys.argv[2] ##### Load the molecule from the CSD and convert to RDKit csd_mol = csd_reader.molecule(refcode) mol_block = csd_mol.to_string('sdf') rdkit_mol = rdkit.Chem.MolFromMolBlock(mol_block) if rdkit_mol is None: outfile = open(outfn, 'w') outfile.write("Unable to generate RDKit molecule for %s" % refcode) outfile.close() sys.exit(1) ##### Generate the InChI strings and keys inchi_str = rdkit.Chem.MolToInchi(rdkit_mol) inchi_key = rdkit.Chem.InchiToInchiKey(inchi_str) ##### Get the PubchemData # This is done by first using UniChem to get the PubChem ID from the InChI key. We can then do a REST call to get the PubChem data itself # # More info: # - UniChem - https://www.ebi.ac.uk/unichem/ # - PubChem PUG REST API - https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html # found = True unichem_req = 'https://www.ebi.ac.uk/unichem/rest/inchikey/%s' % inchi_key unichem_result = requests.get(unichem_req) if unichem_result.status_code != 200: outfile = open(outfn, 'w') outfile.write('No UniChem match found') outfile.close() sys.exit() for result in unichem_result.json(): # print result if result['src_id'] == '22': pubchem_id = result['src_compound_id'] break def get_pubchem_data(pubchem_data_object): """Get the data we want from the pubchem_data json object""" title = pubchem_data_object['Record']['Section'][2]['Section'][0]['Information'][0]['StringValue'] iupac_name = pubchem_data_object['Record']['Section'][2]['Section'][1]['Section'][0]['Information'][0][ 'StringValue'] synonyms = pubchem_data_object['Record']['Section'][2]['Section'][2]['Section'][0]['Information'][0][ 'StringValueList'] canon_smiles = pubchem_data_object['Record']['Section'][2]['Section'][1]['Section'][3]['Information'][0][ 'StringValue'] molweight = pubchem_data_object['Record']['Section'][3]['Section'][0]['Section'][0]['Information'][0]['NumValue'] formula = pubchem_data_object['Record']['Section'][3]['Section'][0]['Section'][1]['Information'][0]['StringValue'] return title, iupac_name, canon_smiles, synonyms, molweight, formula pubchem_req = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/%s/JSON/' % pubchem_id # print pubchem_req pubchem_result = requests.get(pubchem_req) if pubchem_result.status_code != 200: outfile = open(outfn, 'w') outfile.write('No PubChem match found') outfile.close() sys.exit() else: pubchem_json = json.loads(pubchem_result.text) pubchem_data = get_pubchem_data(pubchem_json) all_data = [refcode, pubchem_id] + list(pubchem_data) def formatted_output(data, imgfn): """Return formatted output""" html_string = '\n' html_string += '

CSD Refcode: %s

\n' % data[0] html_string += '

PubChem Info

\n' html_string += '\n' html_string += '\n' % imgfn html_string += '' return html_string def write_diagram(csd_mol): """write a mol diagram""" diagram_generator = DiagramGenerator() diagram_generator.settings.font_size = 10 diagram_generator.settings.line_width = 1.6 diagram_generator.settings.image_width = 250 diagram_generator.settings.image_height = 250 img = diagram_generator.image(csd_mol) # img is a PIL (Python Imaging Library) image tfileh, tfilen = tempfile.mkstemp(suffix='.png') img.save(tfilen, "PNG") return tfilen imgfn = write_diagram(csd_mol) outfile = open(outfn, 'w') outfile.write(formatted_output(all_data, imgfn)) outfile.close()