Spaces:
Runtime error
Runtime error
| import tqdm | |
| from multiprocessing import Pool, cpu_count | |
| import signal | |
| import sys | |
| import time | |
| from flickrapi import FlickrAPI | |
import os

# Flickr API credentials.
# Environment variables of the same name take precedence; the literals are
# kept only as a backward-compatible fallback.
# NOTE(review): these credentials are committed in plain text. They should be
# rotated and supplied exclusively through the environment.
FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') or '80ef21a6f7eb0984ea613c316a89ca69'
FLICKR_API_SECRET = os.environ.get('FLICKR_API_SECRET') or '4d0e8ce6734f4b3f'

# Module-level client used by the lookup functions in this file. It is created
# with format='parsed-json' and token storage disabled (store_token=False).
flickr = FlickrAPI(FLICKR_API_KEY, FLICKR_API_SECRET, format='parsed-json', store_token=False)
def get_photo_id(url):
    """Extract the photo ID from a Flickr image URL.

    The ID is the part of the last ``/``-separated segment that precedes the
    first underscore, e.g. ``".../5067/1234_abcdef_o.jpg"`` -> ``"1234"``.

    Args:
        url: URL string to parse.

    Returns:
        The extracted ID as a string (empty when the last segment is empty or
        starts with an underscore), or ``None`` when *url* cannot be split as
        a string (e.g. ``None``, ``bytes``, numbers).
    """
    try:
        last_segment = url.rsplit('/', 1)[-1]
        return last_segment.split('_', 1)[0]
    except (AttributeError, TypeError):
        # Non-string input. Only the errors that splitting can raise are
        # caught, so KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
def get_other_info(url):
    """Look up owner and license information for a Flickr photo URL.

    The photo ID is derived from *url* with ``get_photo_id`` and passed to
    ``flickr.photos.getInfo``. The lookup is best-effort: any failure (no
    usable ID, API error, unexpected response shape) yields placeholder
    values instead of raising.

    Args:
        url: URL of the photo.

    Returns:
        A dict with the keys ``username``, ``realname``, ``nsid``,
        ``flickr_url`` and ``license``. On failure ``username``, ``realname``
        and ``license`` are ``'Unknown'`` and the other two are empty strings.
    """
    fallback = {
        'username': 'Unknown',
        'realname': 'Unknown',
        'nsid': '',
        'flickr_url': '',
        'license': 'Unknown',
    }
    try:
        photo_id = get_photo_id(url)
        if not photo_id:
            return fallback

        # Short pause before every API call to throttle the request rate.
        time.sleep(0.1)
        photo = flickr.photos.getInfo(photo_id=photo_id)['photo']
        photo_license = photo['license']
        owner = photo['owner']
        nsid = owner.get('nsid', '')
        return {
            'username': owner.get('username', ''),
            'realname': owner.get('realname', ''),
            'nsid': nsid,
            'flickr_url': f"https://www.flickr.com/photos/{nsid}/{photo_id}",
            'license': photo_license,
        }
    except Exception:
        # Deliberate best-effort: report placeholders rather than abort the
        # batch. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return fallback
def init_worker():
    """Make the calling process ignore SIGINT.

    Passed as the ``initializer`` of the worker pool in this file, so it runs
    once in every worker process.
    """
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)
def process_url(url):
    """Return author info for *url*, converting any exception into a record.

    Delegates to ``get_other_info``. If that call raises, the result is a
    dict of the same shape whose ``username`` is ``'Error'``, whose
    ``realname`` carries the exception text and whose ``flickr_url`` is the
    input URL.
    """
    try:
        info = get_other_info(url)
    except Exception as exc:
        info = {
            'username': 'Error',
            'realname': str(exc),
            'nsid': '',
            'flickr_url': url,
            'license': 'Unknown'
        }
    return info
def process_urls_in_chunks(urls, chunk_size=100000):
    """Resolve author info for many URLs using a pool of worker processes.

    The URLs are handed to the pool one chunk at a time, with a progress bar
    per chunk. Results keep the order of *urls*.

    Args:
        urls: Sized, sliceable sequence of URL strings.
        chunk_size: Number of URLs submitted per progress-bar batch; must be
            at least 1.

    Returns:
        A list with one result dict per URL, as produced by ``process_url``.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
        SystemExit: With status 1 when interrupted by the user (Ctrl-C).
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently drop every URL.
        raise ValueError("chunk_size must be a positive integer")
    if len(urls) == 0:
        # Nothing to do; avoid spawning worker processes for no work.
        return []

    authors = []
    with Pool(cpu_count(), initializer=init_worker) as pool:
        try:
            # Process URLs in chunks
            for start in range(0, len(urls), chunk_size):
                chunk = urls[start:start + chunk_size]
                progress = tqdm.tqdm(
                    pool.imap(process_url, chunk),
                    total=len(chunk),
                    desc=f"Processing chunk {start // chunk_size + 1}",
                )
                authors.extend(progress)
        except KeyboardInterrupt:
            # Workers ignore SIGINT (see the pool initializer), so the parent
            # is responsible for stopping them.
            pool.terminate()
            pool.join()
            print("\nProcessing interrupted by user")
            sys.exit(1)
    return authors
if __name__ == "__main__":
    urls_file = "data/openimages_urls.txt"
    max_lines = 100000  # only the first 100000 lines of the file are used

    urls = []
    with open(urls_file, encoding="utf-8") as f:
        # Stream the file and stop early instead of loading every line
        # before slicing.
        for line_number, line in enumerate(f):
            if line_number >= max_lines:
                break
            url = line.strip()
            if url:  # blank lines can never resolve to a photo
                urls.append(url)

    authors = process_urls_in_chunks(urls)

    # Count unique authors
    # NOTE: failed lookups carry the placeholder usernames 'Unknown' / 'Error',
    # which are counted here like any other username.
    unique_authors = len({author['username'] for author in authors})
    print(f"unique_authors: {unique_authors}")
    print(f"Number of unique authors: {unique_authors}")