The names of each company's speech recognition API (naming varies from source to source, so apologies if any are wrong)
You can find it by searching for the API called Speech-to-Text. Only Amazon is called Transcribe ...
*** This article assumes that the preparation for using each API (account registration, etc.) has already been completed. *** Easy-to-follow guides to account registration can be found by searching, so please complete registration first.
-There is also an article comparing recognition accuracy
――First, you need to put the audio file you want to recognize in the AWS cloud storage called S3.
--Click *** Create bucket *** near the top left of the S3 page to create a bucket. The bucket name can be anything. The region is tokyo.
--Bucket name: recognition test (anything is fine)
- Region: Asia Pacific(Tokyo)
--Once you have created a bucket, upload the audio file to the created bucket and you are ready to go. There is a button called *** Upload *** near the upper left, so you can upload it by clicking it. You can also create folders to combine multiple audio files, so do whatever you like.
--Next we start recognition, but before calling the API you need to set environment variables such as the API access key.
% export AWS_DEFAULT_REGION=ap-northeast-1
% source ~/.zshrc
--All you have to do is hit the API. Here is a sample program.
from __future__ import print_function
import os,sys
import time
import boto3
import glob
from pprint import pprint
import re
import requests
import json
def extract_url(response):
    """Return the S3 URL of the transcript file from a Transcribe API response.

    Args:
        response: The value returned by ``get_transcription_job``. A mapping
            is read directly; anything else is searched as text.

    Returns:
        The ``TranscriptFileUri`` string.

    Raises:
        IndexError: If no transcript URL can be found in ``response``.
    """
    # Preferred path: read the nested key directly. This does not depend on
    # how the response happens to be rendered by str().
    try:
        return response['TranscriptionJob']['Transcript']['TranscriptFileUri']
    except (KeyError, TypeError, IndexError):
        pass
    # Fallback kept from the original implementation: scan the textual form.
    # It only matches when the Transcript entry is followed by another key.
    p = re.compile(r'(?:\{\'TranscriptFileUri\':[ ]\')(.*?)(?:\'\}\,)')
    url = re.findall(p, str(response))[0]
    return url
def get_json_result(url):
    """Download the recognition-result document (JSON text) from ``url``.

    Args:
        url: Location of the transcript file.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an HTTP error status.
    """
    try:
        r = requests.get(url)
        # Treat HTTP error statuses as failures so that an error page is not
        # handed on to the JSON parser as if it were a transcript.
        r.raise_for_status()
        return str(r.text)
    except requests.exceptions.RequestException as err:
        # NOTE(review): the original handler body was lost; reporting the
        # error and re-raising is the conservative choice - confirm.
        print(err)
        raise
def extract_recognition_result(_json):
    """Extract only the recognized text from a transcript JSON document.

    Args:
        _json: JSON text whose ``results.transcripts`` list holds the
            transcript entries.

    Returns:
        The ``transcript`` string of the first entry, or an empty string when
        the ``transcripts`` list is empty.
    """
    json_dict = json.loads(_json)
    transcripts = json_dict['results']['transcripts']
    if not transcripts:
        return ''
    return transcripts[0]['transcript']
def main():
    """Transcribe the audio files listed in speech_fname.txt with Amazon Transcribe.

    Phase 1 starts one transcription job per listed file name, polls until the
    job status is COMPLETED or FAILED, and writes "<file name> <response>" to
    status.txt. Phase 2 fetches the transcript of every completed job and
    writes "<job name> <json>" to json_response.txt and "<job name> <text>"
    to recognition_result.txt.
    """
    # Working directory
    _dir = '/Users/RecognitionTest'
    # Recognition result storage directory (create the AWS directory in
    # advance; API responses and transcription results are saved there)
    recognition_result = _dir+'/AWS'
    # A text file containing the names of the audio files to recognize
    speech_fname_file = _dir+'/speech_fname.txt'

    # Store the audio file names in a list, one per non-blank line
    with open(speech_fname_file, 'r') as f:
        speech_fname_list = [line.strip() for line in f if line.strip()]

    status_file = recognition_result+'/status.txt'
    json_file = recognition_result+'/json_response.txt'
    recog_result = recognition_result+'/recognition_result.txt'  # File for saving recognition results

    # NOTE(review): the media URI in the original was truncated to
    # 'https://[Bucket name]{job_name}'. Replace the placeholder with the
    # name of the S3 bucket that holds the audio files.
    bucket_name = '[Bucket name]'

    # One client is enough for all jobs
    transcribe = boto3.client('transcribe')

    with open(status_file, 'w') as status_out:
        for speech_fname in speech_fname_list:
            # The job name does not have to be the audio file name
            job_name = str(speech_fname)
            job_uri = f's3://{bucket_name}/{job_name}'
            # NOTE(review): only the Media argument survived in the original;
            # LanguageCode='ja-JP' is assumed from the other samples - confirm.
            transcribe.start_transcription_job(
                TranscriptionJobName=job_name,
                Media={'MediaFileUri': job_uri},
                LanguageCode='ja-JP',
            )
            while True:
                # status: response information (includes the S3 URL where the
                # recognition result is stored)
                status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
                if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
                    break
                print("Not ready yet...")
                time.sleep(5)  # avoid hammering the API while the job runs
            status_out.write(f'{speech_fname} {status}\n')

    with open(json_file, 'w') as json_out, open(recog_result, 'w') as result:
        # Iterate the job names directly rather than re-parsing status.txt,
        # so a response that spans several lines cannot yield bogus job names.
        for speech_fname in speech_fname_list:
            job_name = str(speech_fname)
            response = transcribe.get_transcription_job(TranscriptionJobName=job_name)
            if response['TranscriptionJob']['TranscriptionJobStatus'] != 'COMPLETED':
                # A failed job has no transcript to download
                print(f'{job_name}: transcription did not complete, skipping')
                continue
            url = extract_url(response)
            _json = get_json_result(url)
            recog_text = extract_recognition_result(_json)
            json_out.write(f'{job_name} {_json}\n')
            result.write(f'{job_name} {recog_text}\n')
# Run the Amazon Transcribe sample only when executed as a script.
if __name__ == "__main__":
    main()
--speech_fname.txt is a text file that lists the names of the audio files you want to recognize, one per line. *** Each name must be identical to the name of the audio file placed in the S3 bucket. *** An example is given below for the case where you want to recognize five separate audio files; audio files with these same names must exist in the S3 bucket.
--Once you have set the environment variables such as API access key and prepared speech_fname.txt, execute to start recognition.
--The recognition result is written to /Users/RecognitionTest/AWS/recognition_result.txt
--Can also be confirmed on Amazon Transcribe
--Unlike Amazon Transcribe, you don't have to upload the audio you want to recognize to the cloud; local audio files can be recognized directly. --Prepare a text file that lists the paths of the audio files you want to recognize (speech_data_path.txt in the sample program below). The same file is used for IBM Watson and Microsoft Azure. --Then, first pass the API credentials through an environment variable. The credentials are stored in a JSON file that needs to be downloaded from the GCP console: go to *** APIs & Services *** in the navigation menu to download the credentials in JSON format.
% export GOOGLE_APPLICATION_CREDENTIALS="[path to json file]"
% source ~/.zshrc
--All you have to do is hit the API. Here is a sample program.
import io
import glob
import os
import shutil
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
def main():
    """Recognize the audio files listed in speech_data_path.txt with Google Cloud Speech-to-Text.

    Each listed file is read from local disk, sent to ``client.recognize``
    with Japanese (ja-JP) as the language, and every returned result is
    printed and written as "<audio file name> <transcript>" to
    GCP/recognition_result.txt.
    """
    client = speech_v1p1beta1.SpeechClient()

    # Working directory
    _dir = '/Users/RecognitionTest'
    # Recognition result storage directory
    recognition_result = _dir+'/GCP'
    # A text file containing the paths of the audio files to recognize
    speech_data_path_file = _dir+'/speech_data_path.txt'

    # Store the audio file paths in a list, one per non-blank line
    with open(speech_data_path_file, 'r') as f:
        speech_path_list = [line.strip() for line in f if line.strip()]

    # The request settings are the same for every file, so build them once.
    # The use case of the audio, e.g. PHONE_CALL, DISCUSSION, PRESENTATION, et al.
    interaction_type = enums.RecognitionMetadata.InteractionType.DISCUSSION
    # The kind of device used to capture the audio
    recording_device_type = enums.RecognitionMetadata.RecordingDeviceType.RECORDING_DEVICE_TYPE_UNSPECIFIED
    # The device used to make the recording.
    # Arbitrary string, e.g. 'Pixel XL', 'VoIP', 'Cardioid Microphone', or other
    # value.
    recording_device_name = "MR"
    metadata = {
        "interaction_type": interaction_type,
        "recording_device_type": recording_device_type,
        "recording_device_name": recording_device_name,
    }
    # The language of the supplied audio. Even though additional languages are
    # provided by alternative_language_codes, a primary language is still required.
    language_code = "ja-JP"  # Set language to Japanese
    config = {"metadata": metadata, "language_code": language_code}

    recog_result_fname = recognition_result+'/recognition_result.txt'  # File for saving recognition results
    with open(recog_result_fname, 'w') as recog_result:
        for speech_path in speech_path_list:
            # Audio file name without directory and extension; used to label
            # the recognition result
            speech_file_name = speech_path.split('/')[-1].split('.')[0]

            with io.open(speech_path, "rb") as f:
                content = f.read()
            audio = {"content": content}

            # Start recognition
            response = client.recognize(config, audio)

            # Saving and displaying recognition results
            for result in response.results:
                # First alternative is the most probable result
                alternative = result.alternatives[0]
                print(u"Transcript: {}".format(alternative.transcript))
                recog_result.write(u"{} {}".format(speech_file_name, alternative.transcript)+'\n')
# Run the Google Cloud Speech-to-Text sample only when executed as a script.
if __name__ == "__main__":
    main()
--The recognition result is written to /Users/RecognitionTest/GCP/recognition_result.txt
--The usage is almost the same as Google Cloud Speech-to-Text. --*** However, the difference is that the API key etc. are written in the program itself, *** whereas for Google and Amazon they are set as environment variables. --You need to get the API key and the URL of the endpoint. --This is a sample program. --*** Please replace [My API key] *** and *** [URL of the endpoint] *** with your own values. --It is better to choose "jp-tok" (Tokyo) as the endpoint location.
import os,sys
import glob
import re
import json
from os.path import join, dirname
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from pprint import pprint
import shutil
import jaconv
def extract_recognition_result(_json):
    """Extract the recognized text from a Watson recognition-result JSON document.

    Words containing the hesitation marker 'D_' are converted from katakana
    to hiragana, and the marker itself is then removed.

    Args:
        _json: JSON text whose ``results[0].alternatives[0].transcript``
            holds the space-separated transcript.

    Returns:
        The cleaned transcript, or ``' '`` when the document has no transcript.
    """
    json_dict = json.loads(_json)
    try:
        transcript = json_dict['results'][0]['alternatives'][0]['transcript'].split(' ')
    except (KeyError, IndexError, TypeError):
        # Nothing was recognized for this audio file
        return ' '
    # Hesitation words are written in katakana, so convert them to hiragana.
    # NOTE(review): the conversion call was lost from the original;
    # jaconv.kata2hira is assumed - confirm.
    recognized_result = [
        jaconv.kata2hira(word) if 'D_' in word else word
        for word in transcript
    ]
    recognized_result = ' '.join(recognized_result)
    # Hesitations are marked with 'D_'; delete the marker
    recognized_result = recognized_result.replace('D_', '')
    return str(recognized_result)
def main():
    """Recognize the audio files listed in speech_data_path.txt with IBM Watson Speech to Text.

    Each listed file is sent to ``service.recognize`` and the JSON response is
    saved as Watson/json_result/<audio file name>.json. Afterwards every
    ``*.json`` file in that directory is reduced to its transcript and written
    as "<audio file name> <transcript>" to Watson/recognition_result.txt.
    """
    # Working directory
    _dir = '/Users/RecognitionTest'
    # Recognition result storage directory
    recognition_result = _dir+'/Watson'
    # A text file containing the paths of the audio files to recognize
    speech_data_path_file = _dir+'/speech_data_path.txt'

    # Store the audio file paths in a list, one per non-blank line
    with open(speech_data_path_file, 'r') as f:
        speech_path_list = [line.strip() for line in f if line.strip()]

    # Storage directory for the JSON files (recognition results)
    json_result_dir = recognition_result+'/json_result'

    # The client does not depend on the audio file, so create it once.
    # set apikey
    authenticator = IAMAuthenticator('[My API key]')
    service = SpeechToTextV1(authenticator=authenticator)
    # set endpoint url
    service.set_service_url('[URL of the endpoint]')
    lang = 'ja-JP_BroadbandModel'  # Set language to Japanese

    for speech_path in speech_path_list:
        # Audio file name without directory and extension; used as the name
        # of the JSON file holding the recognition result
        speech_file_name = speech_path.split('/')[-1].split('.')[0]

        with open(speech_path, 'rb') as audio_file:
            result_json = service.recognize(audio=audio_file, content_type='audio/wav', timestamps=True, model=lang, word_confidence=True, end_of_phrase_silence_time=30.0)
        result_json = result_json.get_result()

        # Open the output only after the request succeeded, so a failed
        # request does not leave an empty JSON file behind.
        with open(f'{json_result_dir}/{speech_file_name}.json', 'w') as json_out:
            json_out.write(json.dumps(result_json, indent=2, ensure_ascii=False))

    # Sorted so that the output order is the same on every run
    json_file_list = sorted(glob.glob(json_result_dir+'/*.json'))
    recog_result_file = recognition_result+'/recognition_result.txt'
    with open(recog_result_file, 'w') as result:
        for json_file in json_file_list:
            speech_file_name = json_file.strip().split('/')[-1].split('.')[0]
            with open(json_file, 'r') as _json:
                # Extract only the recognition result from the saved JSON
                recog_result = extract_recognition_result(_json.read())
            result.write(f'{speech_file_name} {recog_result}\n')
# Run the IBM Watson sample only when executed as a script.
if __name__ == "__main__":
    main()
--By default, Watson prefixes hesitation (filler) words with "D_". The sample program removes this prefix. Watson also writes filler words in katakana, but I needed hiragana, so the program converts them.
--The recognition result is written to /Users/RecognitionTest/Watson/recognition_result.txt
--As with IBM Watson, the Azure API key (speech_key) and region are written in the program. --speech_key: *** Please replace [My speech_key] with your own key *** --service_region: *** japaneast ***
import time
import wave
import glob
import re
import os
# Import the Speech SDK, and stop with installation guidance when it is missing.
try:
    import azure.cognitiveservices.speech as speechsdk
except ImportError:
    print("""
    Importing the Speech SDK for Python failed.
    Refer to
    https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstart-python for
    installation instructions.
    """)
    import sys
    # Nothing below can work without the SDK
    sys.exit(1)
# Set up the subscription info for the Speech Service:
# Replace with your own subscription key and service region (e.g., "westus").
# Set the service region (service_region) to Japan East (japaneast).
# speech_key -> check it on your Azure account page
speech_key, service_region = "[My speech_key]", "japaneast"
# Specify the path to an audio file containing speech (mono WAV / PCM with a sampling rate of 16
# kHz).
def _recognize_file(speech_path, recog_result):
    """Run continuous recognition on one audio file and log every recognized event.

    Args:
        speech_path: Path of the audio file to recognize.
        recog_result: Writable text file; one line
            "<audio file name> RECOGNIZED: <event>" is written per event.
    """
    # Audio file name without directory and extension; labels the output lines
    speech_file_name = speech_path.split('/')[-1].split('.')[0]

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=speech_path)
    speech_config.speech_recognition_language = "ja-JP"  # Set language to Japanese
    # Profanity handling: 0 -> mask, 1 -> remove, 2 -> leave as is
    profanity_option = speechsdk.ProfanityOption(2)
    speech_config.set_profanity(profanity_option=profanity_option)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        nonlocal done
        print('CLOSING on {}'.format(evt))
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    # Writing recognition result
    speech_recognizer.recognized.connect(lambda evt: recog_result.write('{} RECOGNIZED: {}'.format(speech_file_name, evt)+'\n'))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)  # wait for stop_cb without spinning the CPU
    speech_recognizer.stop_continuous_recognition()


def main():
    """performs continuous speech recognition with input from an audio file"""
    # <SpeechContinuousRecognitionWithFile>
    # Working directory
    _dir = '/Users/RecognitionTest'
    # Recognition result storage directory
    recognition_result = _dir+'/Azure'
    # A text file containing the paths of the audio files to recognize
    speech_data_path_file = _dir+'/speech_data_path.txt'

    # Store the audio file paths in a list, one per non-blank line
    with open(speech_data_path_file, 'r') as f:
        speech_path_list = [line.strip() for line in f if line.strip()]

    # File for the raw recognition output (pre_result.txt also contains
    # information other than the recognized text)
    with open(f'{recognition_result}/pre_result.txt', 'w') as recog_result:
        for speech_path in speech_path_list:
            _recognize_file(speech_path, recog_result)
    # </SpeechContinuousRecognitionWithFile>
def fix_recognition_result(pre_result='/Users/RecognitionTest/Azure/pre_result.txt'):
    """Reduce the raw Azure output to "<audio file name> <recognized text>" lines.

    Each line of pre_result.txt has the following format:

        [SPEECH FILE NAME] RECOGNIZED: SpeechRecognitionEventArgs(session_id=XXX, result=SpeechRecognitionResult(result_id=YYY, text="[Recognition result]", reason=ResultReason.RecognizedSpeech))

    Only [SPEECH FILE NAME] and [Recognition result] are kept. The output is
    written next to the input, to the same file name with 'pre_' removed.
    Lines without a ``text="..."`` field are skipped.

    Args:
        pre_result: Path of the raw result file. The default is the file the
            Azure sample's ``main`` writes.

    Raises:
        ValueError: If the file name contains no 'pre_', since the output
            would then overwrite the input.
    """
    # Strip 'pre_' from the file name only, never from a directory name
    directory, fname = os.path.split(pre_result)
    cleaned_fname = fname.replace('pre_', '')
    if cleaned_fname == fname:
        raise ValueError(f"file name must contain 'pre_': {pre_result}")
    cleaned = os.path.join(directory, cleaned_fname)

    text_pattern = re.compile(r'text="(.*)",')
    # Delete everything other than the recognition result (parameters etc.)
    with open(pre_result, 'r') as pre, open(cleaned, 'w') as result:
        for line in pre:
            speech_file_name, _, event_repr = line.strip().partition(' ')
            match = text_pattern.search(event_repr)
            if match is None:
                continue
            result.write(f'{speech_file_name} {match.group(1)}\n')
# Run the Azure sample only when executed as a script.
if __name__ == "__main__":
    main()
    # NOTE(review): the guard body was lost; running the clean-up step right
    # after recognition is assumed - confirm.
    fix_recognition_result()
--Azure kindly hides inappropriate remarks such as F word with an asterisk like "***". I had to calculate the WER, so the sample program is set to display all of them.
- profanity_option = speechsdk.ProfanityOption(2)
--Arguments are 0 (hide inappropriate statements with an asterisk), 1 (delete), 2 (do not hide or delete)
--The raw recognition output is written to /Users/RecognitionTest/Azure/pre_result.txt; fix_recognition_result() then writes the cleaned text to result.txt next to the pre_result.txt it reads.
