diff --git a/creation_datasets/README.txt b/creation_datasets/README.txt index f53c062093b9fcc890226c422abbb7ce3db13f2a..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/creation_datasets/README.txt +++ b/creation_datasets/README.txt @@ -1,2 +0,0 @@ -Pour télécharger les fichiers audio nécessaires au test des modèles américains : lancer le fichier api_download_test_america.py, les fichiers sont enregistrés dans un dossier ./creation_dataset/audio_files_test -Pour télécharger les fichiers audio nécessaires au réentraînement du modèle sur les oiseaux européens : lancer le fichier api_download_train.py, les fichiers sont enregistrés dans un dossier ./creation_dataset/audio_files_france_train diff --git a/creation_datasets/api_data_test_america.py b/creation_datasets/api_data_test_america.py deleted file mode 100644 index f70e7f040ce95533cd3db4d8e854e48332f31268..0000000000000000000000000000000000000000 --- a/creation_datasets/api_data_test_america.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -import requests -from tqdm import tqdm - -# crée un fichier csv avec le lien vers 5 fichiers audio pour chaque espèce considérée (espèces de la base de donnée d'entraînement des modèles à tester) - -species = ["American Bittern", "American Crow", "American Goldfinch", "American Kestrel", "Buff-bellied Pipit", "American Redstart", "American Robin", "American Wigeon", "American Woodcock", "American Tree Sparrow", "Anna's Hummingbird", "Ash-throated Flycatcher", "Baird's Sandpiper", "Bald Eagle", "Baltimore Oriole", "Sand Martin", "Barn Swallow", "Black-and-white Warbler", "Belted Kingfisher", "Bell's Sparrow", "Bewick's Wren", "Black-billed Cuckoo", "Black-billed Magpie", "Blackburnian Warbler", "Black-capped Chickadee", "Black-chinned Hummingbird", "Black-headed Grosbeak", "Blackpoll Warbler", "Black-throated Sparrow", "Black Phoebe", "Blue Grosbeak", "Blue Jay", "Brown-headed Cowbird", "Bobolink", "Bonaparte's Gull", "Barred Owl", "Brewer's Blackbird", "Brewer's Sparrow", "Brown Creeper", "Brown Thrasher", "Broad-tailed Hummingbird", "Broad-winged Hawk", "Black-throated Blue Warbler", "Black-throated Green Warbler", "Black-throated Grey Warbler", "Bufflehead", "Blue-grey Gnatcatcher", "Blue-headed Vireo", "Bullock's Oriole", "American Bushtit", "Blue-winged Teal", "Blue-winged Warbler", "Cactus Wren", "California Gull", "California Quail", "Cape May Warbler", "Canada Goose", "Canada Warbler", "Canyon Wren", "Carolina Wren", "Cassin's Finch", "Caspian Tern", "Cassin's Vireo", "Cedar Waxwing", "Chipping Sparrow", "Chimney Swift", "Chestnut-sided Warbler", "Chukar Partridge", "Clark's Nutcracker", "American Cliff Swallow", "Common Goldeneye", "Common Grackle", "Common Loon", "Common Merganser", "Common Nighthawk", "Northern Raven", "Common Redpoll", "Common Tern", "Common Yellowthroat", "Cooper's Hawk", "Costa's Hummingbird", "California Scrub Jay", "Dark-eyed Junco", "Double-crested Cormorant", "Downy Woodpecker", "American Dusky Flycatcher", "Black-necked Grebe", "Eastern Bluebird", "Eastern Kingbird", "Eastern Meadowlark", "Eastern Phoebe", "Eastern Towhee", "Eastern Wood Pewee", "Eurasian Collared Dove", "Common Starling", "Evening Grosbeak", "Field Sparrow", "Fish Crow", "Red Fox Sparrow", "Gadwall", "Grey-crowned Rosy Finch", "Green-tailed Towhee", "Eurasian Teal", "Golden-crowned Kinglet", "Golden-crowned Sparrow", "Golden Eagle", "Great Blue Heron", "Great Crested Flycatcher", "Great Egret", "Greater Roadrunner", "Greater Yellowlegs", "Great Horned Owl", "Green Heron", "Great-tailed Grackle", "Grey Catbird", "American Grey Flycatcher", "Hairy Woodpecker", "Hammond's Flycatcher", "European Herring Gull", "Hermit Thrush", "Hooded Merganser", "Hooded Warbler", "Horned Grebe", "Horned Lark", "House Finch", "House Sparrow", "House Wren", "Indigo Bunting", "Juniper Titmouse", "Killdeer", - "Ladder-backed Woodpecker", "Lark Sparrow", "Lazuli Bunting", "Least Bittern", "Least Flycatcher", "Least Sandpiper", "LeConte's Thrasher", "Lesser Goldfinch", "Lesser Nighthawk", "Lesser Yellowlegs", "Lewis' Woodpecker", "Lincoln's Sparrow", "Long-billed Curlew", "Long-billed Dowitcher", "Loggerhead Shrike", "Long-tailed Duck", "Louisiana Waterthrush", "MacGillivray's Warbler", "Magnolia Warbler", "Mallard", "Marsh Wren", "Merlin", "Mountain Bluebird", "Mountain Chickadee", "Mourning Dove", "Northern Cardinal", "Northern Flicker", "Northern Harrier", "Northern Mockingbird", "Northern Parula", "Northern Pintail", "Northern Shoveler", "Northern Waterthrush", "Northern Rough-winged Swallow", "Nuttall's Woodpecker", "Olive-sided Flycatcher", "Orange-crowned Warbler", "Western Osprey", "Ovenbird", "Palm Warbler", "Pacific-slope Flycatcher", "Pectoral Sandpiper", "Peregrine Falcon", "Phainopepla", "Pied-billed Grebe", "Pileated Woodpecker", "Pine Grosbeak", "Pinyon Jay", "Pine Siskin", "Pine Warbler", "Plumbeous Vireo", "Prairie Warbler", "Purple Finch", "Pygmy Nuthatch", "Red-breasted Merganser", "Red-breasted Nuthatch", "Red-breasted Sapsucker", "Red-bellied Woodpecker", "Red Crossbill", "Redhead", "Red-eyed Vireo", "Red-necked Phalarope", "Red-shouldered Hawk", "Red-tailed Hawk", "Red-winged Blackbird", "Ring-billed Gull", "Ring-necked Duck", "Rose-breasted Grosbeak", "Rock Dove", "Rock Wren", "Ruby-throated Hummingbird", "Ruby-crowned Kinglet", "Ruddy Duck", "Ruffed Grouse", "Rufous Hummingbird", "Rusty Blackbird", "Sagebrush Sparrow", "Sage Thrasher", "Savannah Sparrow", "Say's Phoebe", "Scarlet Tanager", "Scott's Oriole", "Semipalmated Plover", "Semipalmated Sandpiper", "Short-eared Owl", "Sharp-shinned Hawk", "Snow Bunting", "Snow Goose", "Solitary Sandpiper", "Song Sparrow", "Sora", "Spotted Sandpiper", "Spotted Towhee", "Steller's Jay", "Swainson's Hawk", "Swamp Sparrow", "Swainson's Thrush", "Tree Swallow", "Trumpeter Swan", "Tufted Titmouse", "Tundra Swan", "Veery", "Vesper Sparrow", "Violet-green Swallow", "Warbling Vireo", "Western Bluebird", "Western Grebe", "Western Kingbird", "Western Meadowlark", "Western Sandpiper", "Western Tanager", "Western Wood Pewee", "White-breasted Nuthatch", "White-crowned Sparrow", "White-faced Ibis", "White-throated Sparrow", "White-throated Swift", "Willow Flycatcher", "Wilson's Snipe", "Wild Turkey", "Winter Wren", "Wilson's Warbler", "Wood Duck", "Woodhouse's Scrub Jay", "Wood Thrush", "American Coot", "Yellow-bellied Flycatcher", "Yellow-bellied Sapsucker", "Yellow-headed Blackbird", "Mangrove Warbler", "Myrtle Warbler", "Yellow-throated Vireo"] - -df = pd.DataFrame() -for specie in species: - i = 0 - print(specie) - response = requests.get( - f'https://xeno-canto.org/api/2/recordings?query={specie}+since:2020-11-11') - js = response.json() - ids, files, extensions, ens, lengths, gens = [], [], [], [], [], [] - for recording in js["recordings"]: - if i < 5: - ids.append(recording["id"]) - files.append(recording["file"]) - extensions.append(recording["file-name"][-4:].lower()) - ens.append(recording["en"]) - lengths.append(recording["length"]) - gens.append(recording["gen"]) - i += 1 - df_ = pd.DataFrame.from_records( - {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths}) - df = df.append(df_) - -df.to_csv("creation_datasets/fichiers_csv/birds_anerica.csv", index=False) diff --git a/creation_datasets/api_data_test_france.py b/creation_datasets/api_data_test_france.py deleted file mode 100644 index 1fa3629ff34b424de6b728b9c35d5adc737eb279..0000000000000000000000000000000000000000 --- a/creation_datasets/api_data_test_france.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -import requests -from tqdm import tqdm - -# crée un fichier csv (birds_france_test.csv) avec le lien vers 5 fichiers audio des espèces considérées pour l'entraînement du modèle français - -species = {'Black Redstart': 0, 'Black Woodpecker': 1, 'Black-crowned Night Heron': 2, 'Brambling': 3, 'Carrion Crow': 4, "Cetti's Warbler": 5, 'Cirl Bunting': 6, 'Coal Tit': 7, 'Common Blackbird': 8, 'Common Chaffinch': 9, 'Common Chiffchaff': 10, 'Common Crane': 11, 'Common Cuckoo': 12, 'Common Firecrest': 13, 'Common Linnet': 14, 'Common Moorhen': 15, 'Common Nightingale': 16, 'Common Redstart': 17, 'Common Reed Bunting': 18, 'Common Ringed Plover': 19, 'Common Sandpiper': 20, 'Common Snipe': 21, 'Common Starling': 22, 'Common Whitethroat': 23, 'Corn Bunting': 24, 'Dunlin': 25, 'Dunnock': 26, 'Eurasian Blackcap': 27, 'Eurasian Blue Tit': 28, 'Eurasian Bullfinch': 29, 'Eurasian Coot': 30, 'Eurasian Curlew': 31, 'Eurasian Eagle-Owl': 32, 'Eurasian Golden Oriole': 33, 'Eurasian Jay': 34, 'Eurasian Nuthatch': 35, 'Eurasian Reed Warbler': 36, 'Eurasian Skylark': 37, 'Eurasian Stone-curlew': 38, 'Eurasian Treecreeper': 39, 'Eurasian Wren': 40, 'European Crested Tit': 41, 'European Goldfinch': 42, 'European Green Woodpecker': 43, - 'European Greenfinch': 44, 'European Nightjar': 45, 'European Pied Flycatcher': 46, 'European Robin': 47, 'European Serin': 48, 'Garden Warbler': 49, 'Goldcrest': 50, 'Great Spotted Woodpecker': 51, 'Great Tit': 52, 'Green Sandpiper': 53, 'Grey Heron': 54, 'House Sparrow': 55, 'Little Bittern': 56, 'Little Grebe': 57, 'Little Owl': 58, 'Long-eared Owl': 59, 'Long-tailed Tit': 60, 'Marsh Tit': 61, 'Marsh Warbler': 62, 'Meadow Pipit': 63, 'Melodious Warbler': 64, 'Middle Spotted Woodpecker': 65, 'Mistle Thrush': 66, 'Ortolan Bunting': 67, 'Red Crossbill': 68, 'Redwing': 69, 'Ring Ouzel': 70, 'Sardinian Warbler': 71, 'Short-toed Treecreeper': 72, 'Song Thrush': 73, 'Spotted Crake': 74, 'Spotted Flycatcher': 75, 'Subalpine Warbler': 76, 'Tawny Owl': 77, 'Tawny Pipit': 78, 'Tree Pipit': 79, 'Water Pipit': 80, 'Water Rail': 81, 'Western Barn Owl': 82, "Western Bonelli's Warbler": 83, 'Western Jackdaw': 84, 'Western Yellow Wagtail': 85, 'White Wagtail': 86, 'Willow Warbler': 87, 'Wood Warbler': 88, 'Woodlark': 89, 'Yellowhammer': 90} -liste_species = species.keys() -print(liste_species) - -df = pd.DataFrame() -for specie in liste_species: - i = 0 - print(specie) - response = requests.get( - f'https://xeno-canto.org/api/2/recordings?query={specie}+since:2021-11-11') - js = response.json() - ids, files, extensions, ens, lengths, gens = [], [], [], [], [], [] - for recording in js["recordings"]: - if i < 5 and not pd.isnull(recording["file-name"]): - ids.append(recording["id"]) - files.append(recording["file"]) - extensions.append(recording["file-name"][-4:].lower()) - ens.append(recording["en"]) - lengths.append(recording["length"]) - gens.append(recording["gen"]) - i += 1 - df_ = pd.DataFrame.from_records( - {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths}) - df = df.append(df_) - -df.to_csv("creation_datasets/fichiers_csv/birds_france_test.csv", index=False) diff --git a/creation_datasets/api_data_train.py b/creation_datasets/api_data_train.py deleted file mode 100644 index 720daaf14a6e9c0d9ea097fe0cdbaccf563a9bdb..0000000000000000000000000000000000000000 --- a/creation_datasets/api_data_train.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -import requests -from tqdm import tqdm - -# crée un fichier csv avec le lien vers tous les fichiers audio enregistrés en France, pour pouvoir réentraîner sur un dataset français le modèle sélectionné qui fonctionnait bien sur les oiseaux européens - -# species = ["Alder Flycatcher", "American Avocet", "American Bittern", "American Crow", "American Goldfinch", "American Kestrel", "Buff-bellied Pipit", "American Redstart", "American Robin", "American Wigeon", "American Woodcock", "American Tree Sparrow", "Anna's Hummingbird", "Ash-throated Flycatcher", "Baird's Sandpiper", "Bald Eagle", "Baltimore Oriole", "Sand Martin", "Barn Swallow", "Black-and-white Warbler", "Belted Kingfisher", "Bell's Sparrow", "Bewick's Wren", "Black-billed Cuckoo", "Black-billed Magpie", "Blackburnian Warbler", "Black-capped Chickadee", "Black-chinned Hummingbird", "Black-headed Grosbeak", "Blackpoll Warbler", "Black-throated Sparrow", "Black Phoebe", "Blue Grosbeak", "Blue Jay", "Brown-headed Cowbird", "Bobolink", "Bonaparte's Gull", "Barred Owl", "Brewer's Blackbird", "Brewer's Sparrow", "Brown Creeper", "Brown Thrasher", "Broad-tailed Hummingbird", "Broad-winged Hawk", "Black-throated Blue Warbler", "Black-throated Green Warbler", "Black-throated Grey Warbler", "Bufflehead", "Blue-grey Gnatcatcher", "Blue-headed Vireo", "Bullock's Oriole", "American Bushtit", "Blue-winged Teal", "Blue-winged Warbler", "Cactus Wren", "California Gull", "California Quail", "Cape May Warbler", "Canada Goose", "Canada Warbler", "Canyon Wren", "Carolina Wren", "Cassin's Finch", "Caspian Tern", "Cassin's Vireo", "Cedar Waxwing", "Chipping Sparrow", "Chimney Swift", "Chestnut-sided Warbler", "Chukar Partridge", "Clark's Nutcracker", "American Cliff Swallow", "Common Goldeneye", "Common Grackle", "Common Loon", "Common Merganser", "Common Nighthawk", "Northern Raven", "Common Redpoll", "Common Tern", "Common Yellowthroat", "Cooper's Hawk", "Costa's Hummingbird", "California Scrub Jay", "Dark-eyed Junco", "Double-crested Cormorant", "Downy Woodpecker", "American Dusky Flycatcher", "Black-necked Grebe", "Eastern Bluebird", "Eastern Kingbird", "Eastern Meadowlark", "Eastern Phoebe", "Eastern Towhee", "Eastern Wood Pewee", "Eurasian Collared Dove", "Common Starling", "Evening Grosbeak", "Field Sparrow", "Fish Crow", "Red Fox Sparrow", "Gadwall", "Grey-crowned Rosy Finch", "Green-tailed Towhee", "Eurasian Teal", "Golden-crowned Kinglet", "Golden-crowned Sparrow", "Golden Eagle", "Great Blue Heron", "Great Crested Flycatcher", "Great Egret", "Greater Roadrunner", "Greater Yellowlegs", "Great Horned Owl", "Green Heron", "Great-tailed Grackle", "Grey Catbird", "American Grey Flycatcher", "Hairy Woodpecker", "Hammond's Flycatcher", "European Herring Gull", "Hermit Thrush", "Hooded Merganser", "Hooded Warbler", "Horned Grebe", "Horned Lark", "House Finch", "House Sparrow", "House Wren", "Indigo Bunting", "Juniper Titmouse", -# "Killdeer", "Ladder-backed Woodpecker", "Lark Sparrow", "Lazuli Bunting", "Least Bittern", "Least Flycatcher", "Least Sandpiper", "LeConte's Thrasher", "Lesser Goldfinch", "Lesser Nighthawk", "Lesser Yellowlegs", "Lewis' Woodpecker", "Lincoln's Sparrow", "Long-billed Curlew", "Long-billed Dowitcher", "Loggerhead Shrike", "Long-tailed Duck", "Louisiana Waterthrush", "MacGillivray's Warbler", "Magnolia Warbler", "Mallard", "Marsh Wren", "Merlin", "Mountain Bluebird", "Mountain Chickadee", "Mourning Dove", "Northern Cardinal", "Northern Flicker", "Northern Harrier", "Northern Mockingbird", "Northern Parula", "Northern Pintail", "Northern Shoveler", "Northern Waterthrush", "Northern Rough-winged Swallow", "Nuttall's Woodpecker", "Olive-sided Flycatcher", "Orange-crowned Warbler", "Western Osprey", "Ovenbird", "Palm Warbler", "Pacific-slope Flycatcher", "Pectoral Sandpiper", "Peregrine Falcon", "Phainopepla", "Pied-billed Grebe", "Pileated Woodpecker", "Pine Grosbeak", "Pinyon Jay", "Pine Siskin", "Pine Warbler", "Plumbeous Vireo", "Prairie Warbler", "Purple Finch", "Pygmy Nuthatch", "Red-breasted Merganser", "Red-breasted Nuthatch", "Red-breasted Sapsucker", "Red-bellied Woodpecker", "Red Crossbill", "Redhead", "Red-eyed Vireo", "Red-necked Phalarope", "Red-shouldered Hawk", "Red-tailed Hawk", "Red-winged Blackbird", "Ring-billed Gull", "Ring-necked Duck", "Rose-breasted Grosbeak", "Rock Dove", "Rock Wren", "Ruby-throated Hummingbird", "Ruby-crowned Kinglet", "Ruddy Duck", "Ruffed Grouse", "Rufous Hummingbird", "Rusty Blackbird", "Sagebrush Sparrow", "Sage Thrasher", "Savannah Sparrow", "Say's Phoebe", "Scarlet Tanager", "Scott's Oriole", "Semipalmated Plover", "Semipalmated Sandpiper", "Short-eared Owl", "Sharp-shinned Hawk", "Snow Bunting", "Snow Goose", "Solitary Sandpiper", "Song Sparrow", "Sora", "Spotted Sandpiper", "Spotted Towhee", "Steller's Jay", "Swainson's Hawk", "Swamp Sparrow", "Swainson's Thrush", "Tree Swallow", "Trumpeter Swan", "Tufted Titmouse", "Tundra Swan", "Veery", "Vesper Sparrow", "Violet-green Swallow", "Warbling Vireo", "Western Bluebird", "Western Grebe", "Western Kingbird", "Western Meadowlark", "Western Sandpiper", "Western Tanager", "Western Wood Pewee", "White-breasted Nuthatch", "White-crowned Sparrow", "White-faced Ibis", "White-throated Sparrow", "White-throated Swift", "Willow Flycatcher", "Wilson's Snipe", "Wild Turkey", "Winter Wren", "Wilson's Warbler", "Wood Duck", "Woodhouse's Scrub Jay", "Wood Thrush", "American Coot", "Yellow-bellied Flycatcher", "Yellow-bellied Sapsucker", "Yellow-headed Blackbird", "Mangrove Warbler", "Myrtle Warbler", "Yellow-throated Vireo"] -# -# countries = ["France", "Spain", "Belgium", "Luxembourg", "Italy", "Norway", "Sweden", -# "Germany", "Netherlands", "Denmark", "Croatia", "Hungary", "Greece", "Poland", "United Kingdom", "Romania", "Estonia", "Finland", "Portugal", "Ukraine"] - -country = ["France"] - - -df = pd.DataFrame() -for country in country: - i = 0 - print(country) - response = requests.get( - f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}') - js = response.json() - ids, files, extensions, ens, lengths, gens, cnts = [], [], [], [], [], [], [] - for n_page in tqdm(range(1, js["numPages"]+1)): - response = requests.get( - f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}&page={n_page}') - page_js = response.json() - for recording in page_js["recordings"]: - if not pd.isnull(recording["file-name"]): - ids.append(recording["id"]) - files.append(recording["file"]) - extensions.append(recording["file-name"][-4:].lower()) - ens.append(recording["en"]) - lengths.append(recording["length"]) - gens.append(recording["gen"]) - cnts.append(recording["cnt"]) - df_ = pd.DataFrame.from_records( - {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths, "cnt": cnts}) - df = df.append(df_) - -df = df[df['en'] != 'Identity unknown'].copy() -df = df[df['en'] != 'Soundscape'].copy() - -counts = df['en'].value_counts() -print(counts) - -chosen = counts[counts >= 150].index -df = df[df["en"].isin(chosen)] - -df.to_csv("creation_datasets/fichiers_csv/birds_france_train.csv", index=False) diff --git a/creation_datasets/api_download_test_america.py b/creation_datasets/api_download_test_america.py deleted file mode 100644 index 9430c8694ac5bc171e014fbf855ea66d9aa72d61..0000000000000000000000000000000000000000 --- a/creation_datasets/api_download_test_america.py +++ /dev/null @@ -1,58 +0,0 @@ -from pandas import read_csv -from urllib.request import urlretrieve -from os import listdir, mkdir, makedirs, path as os_path -from tqdm import tqdm -from time import sleep -import requests - -# download all the files from the file csv birds.csv - -df = read_csv("creation_datasets/fichiers_csv/birds_america.csv") - -# Removing recordings that are not classified -df = df[df['en'] != 'Identity unknown'].copy() -df = df[df['en'] != 'Soundscape'].copy() -counts = df['en'].value_counts() -print(counts) - - -print(len(df["en"].unique()), "different species") - -makedirs('creation_datasets/audio_files_test_america', exist_ok='True') -audio_dir = 'creation_datasets/audio_files_test_america/' - -nots = [] - -for row in tqdm(df.iterrows(), total=df.shape[0]): - url = row[1]["file"] - print(url) - - f_name = audio_dir + str(row[1]["id"]) + row[1]["extension"] - print(f_name) - - try: - #urlretrieve('https:'+url, filename=audio_dir + f_name) - response = requests.get(url) - open(f_name, 'wb').write(response.content) - - except Exception as e: - print("\nRetrying:", url) - print(e) - sleep(1) - try: - #urlretrieve('https:'+url, filename=audio_dir + f_name) - response = requests.get(url) - open(f_name, 'wb').write(response.content) - except Exception as ee: - print("Not downloaded|", f_name) - print(ee) - nots.append(row[1]["id"]) - pass - -if len(nots) > 0: - with open('not_downloaded.txt', 'w') as f: - for item in nots: - f.write(str(item) + '\n') - print(str(nots)) -else: - print('All files were successfully downloaded!') diff --git a/creation_datasets/api_download_test_france.py b/creation_datasets/api_download_test_france.py deleted file mode 100644 index 2cd8a01b156f5f51e9e8f288286c9d49ac519c86..0000000000000000000000000000000000000000 --- a/creation_datasets/api_download_test_france.py +++ /dev/null @@ -1,62 +0,0 @@ -from pandas import read_csv -from urllib.request import urlretrieve -from os import listdir, mkdir, makedirs, path as os_path -from tqdm import tqdm -from time import sleep -import requests -import pandas as pd - -# télécharge les fichiers audio du fichier csv birds_france_test.csv - -df = read_csv("creation_datasets/fichiers_csv/birds_france_test.csv") - -# Removing recordings that are not classified -df = df[df['en'] != 'Identity unknown'].copy() -df = df[df['en'] != 'Soundscape'].copy() -counts = df['en'].value_counts() -print(counts) - -print(len(df["en"].unique()), "different species") - - -makedirs('creation_datasets/audio_files_test_france', exist_ok='True') -audio_dir = 'creation_datasets/audio_files_test_france/' - -nombre_audio = {} -for specie in df["en"].unique(): - nombre_audio[specie] = 0 -print(nombre_audio) - - -nots = [] - -for row in tqdm(df.iterrows(), total=df.shape[0]): - url = row[1]["file"] - print(url) - - f_name = audio_dir + str(row[1]["id"]) + \ - row[1]["extension"] - - try: - response = requests.get(url) - open(f_name, 'wb').write(response.content) - except Exception as e: - print("\nRetrying:", url) - print(e) - sleep(1) - try: - response = requests.get(url) - open(f_name, 'wb').write(response.content) - except Exception as ee: - print("Not downloaded|", f_name) - print(ee) - nots.append(row[1]["id"]) - pass - -if len(nots) > 0: - with open('not_downloaded.txt', 'w') as f: - for item in nots: - f.write(str(item) + '\n') - print(str(nots)) -else: - print('All files were successfully downloaded!') diff --git a/creation_datasets/create_csv.py b/creation_datasets/create_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..6f664c6e86a4c9c55159b277fec2f66183774759 --- /dev/null +++ b/creation_datasets/create_csv.py @@ -0,0 +1,50 @@ +import pandas as pd +import requests +from tqdm import tqdm +import argparse + +# writes in the csv the informations to download the files for all species of a country with more than 150 (can be modified) recordings + +parser = argparse.ArgumentParser() +parser.add_argument( + "country", help="The country for which you want to create the database. You may chose among this list : ['Arab Emirates', 'Algeria', 'Andorra', 'Angola', 'Antarctica', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Belize', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Congo (Brazzaville)', 'Congo (Democratic Republic)', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'East Timor', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Estonia', 'Ethiopia', 'Finland', 'France', 'French Guiana', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Ivory Coast', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kyrgyzstan', 'Laos', 'Latvia', 'Liberia', 'Lithuania', 'Macedonia', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Malta', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Norway', 'Oman', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda', 'Sao Tome', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia', 'Solomon Islands', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Suriname', 'Sweden', 'Switzerland', 'Taiwan', 'Tajikistan', 'Tanzania', 'Thailand', 'Tunisia', 'Turkey', 'Uganda', 'Ukraine', 'United Kingdom', 'United States', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Vietnam', 'Zambia', 'Zimbabwe']") +args = parser.parse_args() + +country = args.country +country = country.replace(" ", "_") + +df = pd.DataFrame() + +response = requests.get( + f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}') +js = response.json() +ids, files, extensions, ens, lengths, gens, cnts = [], [], [], [], [], [], [] +for n_page in tqdm(range(1, js["numPages"]+1)): + response = requests.get( + f'https://xeno-canto.org/api/2/recordings?query=cnt:{country}&page={n_page}') + page_js = response.json() + for recording in page_js["recordings"]: + if not pd.isnull(recording["file-name"]): + ids.append(recording["id"]) + files.append(recording["file"]) + extensions.append(recording["file-name"][-4:].lower()) + ens.append(recording["en"]) + lengths.append(recording["length"]) + gens.append(recording["gen"]) + cnts.append(recording["cnt"]) +df_ = pd.DataFrame.from_records( + {'id': ids, "file": files, "extension": extensions, "en": ens, "gen": gens, "length": lengths, "cnt": cnts}) +df = df.append(df_) + +# removing recordings that are not classified +df = df[df['en'] != 'Identity unknown'].copy() +df = df[df['en'] != 'Soundscape'].copy() + +counts = df['en'].value_counts() + + +# change the value if you wish to keep more or less species +chosen = counts[counts >= 150].index +df = df[df["en"].isin(chosen)] + +df.to_csv("creation_datasets/fichiers_csv/"+country+".csv", index=False) diff --git a/creation_datasets/api_download_train.py b/creation_datasets/download_files.py similarity index 60% rename from creation_datasets/api_download_train.py rename to creation_datasets/download_files.py index 01cc6dbd3f39668880105f10c5272aa219f70d11..f2718d9ca5c8da1495b0ed38775823f722dc0ae5 100644 --- a/creation_datasets/api_download_train.py +++ b/creation_datasets/download_files.py @@ -4,44 +4,44 @@ from os import listdir, mkdir, makedirs, path as os_path from tqdm import tqdm from time import sleep import requests +import argparse -# télécharge les 50 premiers fichiers audio pour chaque espèce du fichier csv birds_france_train.csv qui compte plus de 150 individus +parser = argparse.ArgumentParser() +parser.add_argument("csv_name", + help="Write the name of the csv (with the extension) from which you wish to download the files") +args = parser.parse_args() -df = read_csv("creation_datasets/fichiers_csv/birds_france_train.csv") +csv_name = args.csv_name -# Removing recordings that are not classified +# download the 50 (can be modified) first files for each specie of the csv +# during the train phase we used 50 files, and during the test phase we used 5 files +df = read_csv("creation_datasets/fichiers_csv/"+csv_name) -print(df.shape) print(len(df["en"].unique()), "different species") -print(df) + +makedirs('creation_datasets/audio_files_'+csv_name[:-4], exist_ok='True') +audio_dir = 'creation_datasets/audio_files_'+csv_name[:-4]+'/' + +nots = [] nombre_audio = {} for specie in df["en"].unique(): nombre_audio[specie] = 0 -print(nombre_audio) - -makedirs('creation_datasets/audio_files_france_train', exist_ok='True') -audio_dir = 'creation_datasets/audio_files_france_train/' - -nots = [] for row in tqdm(df.iterrows(), total=df.shape[0]): - url = row[1]["file"] - print(url) + url = row[1]["file"] specie = row[1]["en"] + # change here to modify the number of files to download for each specie if nombre_audio[specie] < 50: - f_name = audio_dir+str(row[1]["id"]) + \ - row[1]["extension"] - print(f_name) + f_name = audio_dir + str(row[1]["id"]) + row[1]["extension"] + print("Downloading", f_name) nombre_audio[specie] += 1 - print(nombre_audio[specie]) try: - #urlretrieve('https:'+url, filename=audio_dir + f_name) response = requests.get(url) open(f_name, 'wb').write(response.content) @@ -50,12 +50,10 @@ for row in tqdm(df.iterrows(), total=df.shape[0]): print(e) sleep(1) try: - #urlretrieve('https:'+url, filename=audio_dir + f_name) response = requests.get(url) open(f_name, 'wb').write(response.content) except Exception as ee: print("Not downloaded|", f_name) - print(ee) nots.append(row[1]["id"]) pass